root/branches/mbutscher/work/lib/whoosh/formats.py @ 234

Revision 234, 17.0 kB (checked in by mbutscher, 2 years ago)

branches/stable-2.0:
* Bug fixed: Windows: "Invalid handle" error on

command line bridge insertion plugins

branches/mbutscher/work:
* Write last writing program version into wiki db
* Several bug fixes with whoosh (index search)
* Index search: Highlight found terms and jump to

one found term on double-click

* Bug fixed: Windows: "Invalid handle" error on

command line bridge insertion plugins

Line 
1#===============================================================================
2# Copyright 2009 Matt Chaput
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#    http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#===============================================================================
16
17"""
18The classes in this module encode and decode posting information for a field.
19The field format essentially determines what information is stored about each
20occurrence of a term.
21"""
22
23from collections import defaultdict
24from struct import Struct
25from cPickle import dumps, loads
26from cStringIO import StringIO
27
28from whoosh.analysis import unstopped
29from whoosh.system import (_INT_SIZE, _FLOAT_SIZE, pack_uint, unpack_uint,
30                           pack_float, unpack_float)
31from whoosh.util import varint, read_varint, float_to_byte, byte_to_float
32
33
34# Format base class
35
class Format(object):
    """Abstract base class representing a storage format for a field or vector.
    Format objects are responsible for writing and reading the low-level
    representation of a field. It controls what kind/level of information to
    store about the indexed fields.

    Subclasses implement word_values() and encode(), plus one
    ``decode_<name>()`` method for every interpretation of the posting value
    they support; supports(), decoder() and decode_as() look those methods up
    by name.
    """

    # Class-level default; some of the concrete formats in this module
    # override it.
    posting_size = -1
    textual = True
    __inittypes__ = dict(analyzer=object, field_boost=float)

    def __init__(self, analyzer, field_boost=1.0, **options):
        """
        :param analyzer: The analysis.Analyzer object to use to index this
            field. See the analysis module for more information. If this value
            is None, the field is not indexed/searchable.
        :param field_boost: A constant boost factor to scale to the score
            of all queries matching terms in this field.
        :param options: Any additional keyword arguments; kept verbatim in
            ``self.options``.
        """

        self.analyzer = analyzer
        self.field_boost = field_boost
        self.options = options

    def __eq__(self, other):
        """Returns True if 'other' is an instance of exactly the same class
        with an equal attribute dictionary, otherwise False.
        """
        # Always hand back a real bool; comparing against None or another
        # falsy object must not leak that object as the result.
        same_class = self.__class__ is getattr(other, "__class__", None)
        return bool(same_class and self.__dict__ == other.__dict__)

    def __ne__(self, other):
        """Returns the negation of __eq__()."""
        # Python 2 does not derive != from __eq__, so without this method two
        # equal formats would still compare as "not equal".
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r, boost = %s)" % (self.__class__.__name__,
                                       self.analyzer, self.field_boost)

    def clean(self):
        """Calls the analyzer's clean() method, if there is an analyzer and
        it has one.
        """
        if self.analyzer and hasattr(self.analyzer, "clean"):
            self.analyzer.clean()

    def word_values(self, value, **kwargs):
        """Takes the text value to be indexed and yields a series of
        ("tokentext", frequency, weight, valuestring) tuples, where frequency
        is the number of times "tokentext" appeared in the value, weight is the
        weight (a float usually equal to frequency in the absence of per-term
        boosts) and valuestring is encoded field-specific posting value for the
        token. For example, in a Frequency format, the value string would be
        the same as frequency; in a Positions format, the value string would
        encode a list of token positions at which "tokentext" occurred.

        :param value: The unicode text to index.
        """
        raise NotImplementedError

    def analyze(self, unicodestring, mode='', **kwargs):
        """Returns a :class:`whoosh.analysis.Token` iterator from the given
        unicode string.

        :param unicodestring: the string to analyze.
        :param mode: a string indicating the purpose for which the unicode
            string is being analyzed, i.e. 'index' or 'query'.
        :raises Exception: if this format has no analyzer.
        """

        if not self.analyzer:
            raise Exception("%s format has no analyzer" % self.__class__)
        return self.analyzer(unicodestring, mode=mode, **kwargs)

    def encode(self, value):
        """Returns the given value encoded as a string.
        """
        raise NotImplementedError

    def supports(self, name):
        """Returns True if this format supports interpreting its posting
        value as 'name' (e.g. "frequency" or "positions").
        """
        return hasattr(self, "decode_" + name)

    def decoder(self, name):
        """Returns the bound method for interpreting value as 'name',
        where 'name' is for example "frequency" or "positions". This
        object must have a corresponding Format.decode_<name>() method.

        :raises AttributeError: if there is no such decode method.
        """
        return getattr(self, "decode_" + name)

    def decode_as(self, astype, valuestring):
        """Interprets the encoded value string as 'astype', where 'astype' is
        for example "frequency" or "positions". This object must have a
        corresponding decode_<astype>() method.
        """
        return self.decoder(astype)(valuestring)
125
126# Concrete field classes
127
class Existence(Format):
    """Records only that a term occurs in a document; neither frequencies nor
    positions are stored. This is useful for fields that should be searchable
    but not scorable, such as file path.

    Supports: frequency, weight (always reports frequency = 1).
    """

    posting_size = 0
    __inittypes__ = dict(analyzer=object, field_boost=float)

    def __init__(self, analyzer, field_boost=1.0, **options):
        """
        :param analyzer: the analyzer callable used by word_values().
        :param field_boost: the value reported by decode_weight().
        :param options: extra keyword arguments, kept in ``self.options``.
        """
        self.analyzer = analyzer
        self.field_boost = field_boost
        self.options = options

    def word_values(self, value, **kwargs):
        """Returns an iterator of (text, 1, 1.0, '') tuples, one for each
        distinct unstopped token text produced by the analyzer.
        """
        # Repeated tokens collapse into one entry: only presence matters.
        distinct = set()
        for token in unstopped(self.analyzer(value, **kwargs)):
            distinct.add(token.text)
        return ((text, 1, 1.0, '') for text in distinct)

    def encode(self, value):
        """Returns the empty string; this format stores no posting value."""
        return ''

    def decode_frequency(self, valuestring):
        """Always returns 1, whatever the value string is."""
        return 1

    def decode_weight(self, valuestring):
        """Returns the field boost, whatever the value string is."""
        return self.field_boost
157
158
class Frequency(Format):
    """Stores frequency information for each posting.

    Supports: frequency, weight.
    """

    posting_size = _INT_SIZE
    __inittypes__ = dict(analyzer=object, field_boost=float,
                         boost_as_freq=bool)

    def __init__(self, analyzer, field_boost=1.0, boost_as_freq=False,
                 **options):
        """
        :param analyzer: The analysis.Analyzer object to use to index this
            field. See the analysis module for more information. If this value
            is None, the field is not indexed/searchable.
        :param field_boost: A constant boost factor to scale to the score of
            all queries matching terms in this field.
        :param boost_as_freq: Accepted, but neither stored nor used by this
            class.
        :param options: Extra keyword arguments, kept in ``self.options``.
        """

        self.analyzer = analyzer
        self.field_boost = field_boost
        self.options = options

    def word_values(self, value, **kwargs):
        """Returns an iterator of (text, frequency, weight, valuestring)
        tuples, one per distinct unstopped token text. The weight is the sum
        of the boosts of the token's occurrences; the value string encodes
        the frequency only.
        """
        counts = defaultdict(int)
        boost_totals = defaultdict(float)

        tokens = unstopped(self.analyzer(value, boosts=True, **kwargs))
        for token in tokens:
            counts[token.text] += 1
            boost_totals[token.text] += token.boost

        pack = self.encode
        return ((text, count, boost_totals[text], pack(count))
                for text, count in counts.iteritems())

    def encode(self, freq):
        """Returns the frequency packed with pack_uint()."""
        return pack_uint(freq)

    def decode_frequency(self, valuestring):
        """Returns the frequency unpacked from the value string."""
        unpacked = unpack_uint(valuestring)
        return unpacked[0]

    def decode_weight(self, valuestring):
        """Returns the stored frequency multiplied by the field boost."""
        return unpack_uint(valuestring)[0] * self.field_boost
205
class DocBoosts(Frequency):
    """A Field that stores frequency and per-document boost information for
    each posting.

    The value string is the packed frequency followed by the document boost
    as converted by float_to_byte().

    Supports: frequency, weight.
    """

    posting_size = _INT_SIZE + 1

    def word_values(self, value, doc_boost=1.0, **kwargs):
        """Returns an iterator of (text, frequency, weight, valuestring)
        tuples, one per distinct unstopped token text. The weight is the sum
        of the token's boosts multiplied by 'doc_boost'.

        :param value: the text to index.
        :param doc_boost: the boost stored alongside every posting.
        """
        counts = defaultdict(int)
        boost_totals = defaultdict(float)
        tokens = unstopped(self.analyzer(value, boosts=True, **kwargs))
        for token in tokens:
            boost_totals[token.text] += token.boost
            counts[token.text] += 1

        pack = self.encode
        return ((text, count, boost_totals[text] * doc_boost,
                 pack((count, doc_boost)))
                for text, count in counts.iteritems())

    def encode(self, freq_docboost):
        """Encodes a (frequency, document boost) pair as a value string."""
        freq, docboost = freq_docboost
        packed_freq = pack_uint(freq)
        return packed_freq + float_to_byte(docboost)

    def decode_docboosts(self, valuestring):
        """Returns the (frequency, document boost) pair from the value
        string.
        """
        # The boost is the last element of the string, after the frequency.
        boost = byte_to_float(valuestring[-1])
        count = unpack_uint(valuestring[:_INT_SIZE])[0]
        return (count, boost)

    def decode_frequency(self, valuestring):
        """Returns the frequency from the head of the value string."""
        head = valuestring[0:_INT_SIZE]
        return unpack_uint(head)[0]

    def decode_weight(self, valuestring):
        """Returns frequency * document boost * field boost."""
        count = unpack_uint(valuestring[:_INT_SIZE])[0]
        boost = byte_to_float(valuestring[-1])
        return count * boost * self.field_boost
243
244# Vector formats
245
class Positions(Format):
    """A vector that stores position information in each posting, to allow
    phrase searching and "near" queries.

    The value string is the packed number of positions followed by a pickle
    of the delta-encoded position list (see encode()).

    Supports: frequency, weight, positions, position_boosts (always reports
    position boost = 1.0).
    """

    def word_values(self, value, start_pos=0, **kwargs):
        """Returns an iterator of (text, frequency, weight, valuestring)
        tuples, one per distinct unstopped token text. The weight is the sum
        of the boosts of the token's occurrences.

        :param value: the text to index.
        :param start_pos: passed to the analyzer and also added to every
            token position.
        """
        positions_by_text = defaultdict(list)
        boost_totals = defaultdict(float)
        tokens = unstopped(self.analyzer(value, positions=True,
                                         start_pos=start_pos, **kwargs))
        for token in tokens:
            # NOTE(review): start_pos is handed to the analyzer *and* added
            # here, whereas Characters and PositionBoosts store t.pos as-is.
            # Confirm whether the analyzer already offsets t.pos.
            positions_by_text[token.text].append(start_pos + token.pos)
            boost_totals[token.text] += token.boost

        pack = self.encode
        return ((text, len(found), boost_totals[text], pack(found))
                for text, found in positions_by_text.iteritems())

    def encode(self, positions):
        """Encodes a list of positions as a value string.

        Each position is stored as its difference from the previous one
        (the first as its difference from 0).
        """
        deltas = []
        previous = 0
        for current in positions:
            deltas.append(current - previous)
            previous = current
        # The first two bytes and the last byte of the pickle are dropped
        # (presumably the protocol header and the STOP opcode); the decode
        # methods append "." again before unpickling.
        pickled = dumps(deltas, -1)
        return pack_uint(len(deltas)) + pickled[2:-1]

    def decode_positions(self, valuestring):
        """Returns the list of positions stored in the value string."""
        deltas = loads(valuestring[_INT_SIZE:] + ".")
        result = []
        running = 0
        for delta in deltas:
            running += delta
            result.append(running)
        return result

    def decode_frequency(self, valuestring):
        """Returns the number of positions, read from the head of the value
        string.
        """
        head = valuestring[:_INT_SIZE]
        return unpack_uint(head)[0]

    def decode_weight(self, valuestring):
        """Returns the stored frequency multiplied by the field boost."""
        frequency = self.decode_frequency(valuestring)
        return frequency * self.field_boost

    def decode_position_boosts(self, valuestring):
        """Returns a list of (position, 1) pairs, one per stored position."""
        pairs = []
        for pos in self.decode_positions(valuestring):
            pairs.append((pos, 1))
        return pairs
292
class Characters(Positions):
    """Stores token position and character start and end information for each
    posting.

    The value string is the packed number of occurrences followed by a pickle
    of delta-encoded (position, startchar, endchar) triples (see encode()).

    Supports: frequency, weight, positions, position_boosts (always reports
    position boost = 1.0), characters.
    """

    def word_values(self, value, start_pos=0, start_char=0, **kwargs):
        """Returns an iterator of (text, frequency, weight, valuestring)
        tuples, one per distinct unstopped token text. The weight is simply
        the frequency as a float.

        :param value: the text to index.
        :param start_pos: passed through to the analyzer.
        :param start_char: passed to the analyzer and also added to each
            token's start and end character offsets.
        """
        occurrences = defaultdict(list)

        tokens = unstopped(self.analyzer(value, positions=True, chars=True,
                                         start_pos=start_pos,
                                         start_char=start_char, **kwargs))
        for token in tokens:
            bucket = occurrences[token.text]
            bucket.append((token.pos, start_char + token.startchar,
                           start_char + token.endchar))

        pack = self.encode
        return ((text, len(found), float(len(found)), pack(found))
                for text, found in occurrences.iteritems())

    def encode(self, posns_chars):
        """Encodes a list of (pos, startchar, endchar) triples as a value
        string.

        Each triple is stored as (position delta, gap between the previous
        end character and this start character, token length).
        """
        # posns_chars = [(pos, startchar, endchar), ...]
        deltas = []
        prev_pos = 0
        prev_end = 0
        for pos, startchar, endchar in posns_chars:
            deltas.append((pos - prev_pos, startchar - prev_end,
                           endchar - startchar))
            prev_pos, prev_end = pos, endchar
        # Same trimmed-pickle layout as the decode methods expect: they
        # append "." before calling loads().
        pickled = dumps(deltas, -1)
        return pack_uint(len(posns_chars)) + pickled[2:-1]

    def decode_characters(self, valuestring):
        """Returns the list of (position, startchar, endchar) triples stored
        in the value string.
        """
        deltas = loads(valuestring[_INT_SIZE:] + ".")
        result = []
        pos = 0
        end = 0
        for delta in deltas:
            pos = delta[0] + pos
            start = delta[1] + end
            end = delta[2] + start
            result.append((pos, start, end))
        return result

    def decode_positions(self, valuestring):
        """Returns only the list of positions stored in the value string."""
        deltas = loads(valuestring[_INT_SIZE:] + ".")
        result = []
        pos = 0
        for delta in deltas:
            pos = delta[0] + pos
            result.append(pos)
        return result
346
class PositionBoosts(Positions):
    """A format that stores positions and per-position boost information
    in each posting.

    The value string is the packed number of positions, then the packed sum
    of the boosts, then a pickle of delta-encoded (position, boost) pairs
    (see encode()).

    Supports: frequency, weight, positions, position_boosts.
    """

    def word_values(self, value, start_pos=0, **kwargs):
        """Returns an iterator of (text, frequency, weight, valuestring)
        tuples, one per distinct unstopped token text. The weight is the sum
        of the boosts of the token's occurrences.

        :param value: the text to index.
        :param start_pos: passed through to the analyzer.
        """
        # The factory must be ``list``: defaultdict calls it with no
        # arguments for every new key, and iter() without arguments raises
        # TypeError (and an iterator would have no append() anyway).
        seen = defaultdict(list)
        for t in unstopped(self.analyzer(value, positions=True, boosts=True,
                                         start_pos=start_pos, **kwargs)):
            seen[t.text].append((t.pos, t.boost))

        encode = self.encode
        return ((w, len(poslist), sum(p[1] for p in poslist), encode(poslist))
                for w, poslist in seen.iteritems())

    def encode(self, posns_boosts):
        """Encodes a list of (pos, boost) pairs as a value string.

        Positions are stored as differences from the previous position;
        boosts are stored unchanged.
        """
        # posns_boosts = [(pos, boost), ...]
        codes = []
        base = 0
        summedboost = 0
        for pos, boost in posns_boosts:
            summedboost += boost
            codes.append((pos - base, boost))
            base = pos

        # The first two bytes and the last byte of the pickle are dropped;
        # the decode methods append "." again before unpickling.
        return (pack_uint(len(posns_boosts)) + pack_float(summedboost)
                + dumps(codes, -1)[2:-1])

    def decode_position_boosts(self, valuestring):
        """Returns the list of (position, boost) pairs stored in the value
        string.
        """
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + ".")
        position = 0
        posns_boosts = []
        for code in codes:
            position = code[0] + position
            posns_boosts.append((position, code[1]))
        return posns_boosts

    def decode_positions(self, valuestring):
        """Returns only the list of positions stored in the value string."""
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + ".")
        position = 0
        posns = []
        for code in codes:
            position = code[0] + position
            posns.append(position)
        return posns

    def decode_weight(self, valuestring):
        """Returns the summed boost stored after the frequency.

        Unlike the other decode_weight() methods in this module, the result
        is not multiplied by the field boost.
        """
        summedboost = unpack_float(valuestring[_INT_SIZE:_INT_SIZE + _FLOAT_SIZE])[0]
        return summedboost
401
class CharacterBoosts(Characters):
    """A format that stores positions, character start and end, and
    per-position boost information in each posting.

    The value string is the packed number of occurrences, then the packed sum
    of the boosts, then a pickle of delta-encoded
    (position, startchar, endchar, boost) tuples (see encode()).

    Supports: frequency, weight, positions, position_boosts, characters,
    character_boosts.
    """
    # NOTE(review): decode_weight() is inherited and returns
    # frequency * field_boost; the summed boost written by encode() is not
    # read back by any method of this class. Confirm that this is intended.

    def word_values(self, value, start_pos=0, start_char=0, **kwargs):
        """Returns an iterator of (text, frequency, weight, valuestring)
        tuples, one per distinct unstopped token text. The weight is the sum
        of the boosts of the token's occurrences.

        :param value: the text to index.
        :param start_pos: passed through to the analyzer.
        :param start_char: passed to the analyzer and also added to each
            token's start and end character offsets.
        """
        # The factory must be ``list``: defaultdict calls it with no
        # arguments for every new key, and iter() without arguments raises
        # TypeError (and an iterator would have no append() anyway).
        seen = defaultdict(list)
        # The character-offset flag is spelled ``chars``, exactly as in
        # Characters.word_values(); this method previously passed
        # ``characters=True`` instead.
        for t in unstopped(self.analyzer(value, positions=True,
                                         chars=True, boosts=True,
                                         start_pos=start_pos,
                                         start_char=start_char, **kwargs)):
            seen[t.text].append((t.pos,
                                 start_char + t.startchar,
                                 start_char + t.endchar,
                                 t.boost))

        encode = self.encode
        return ((w, len(poslist), sum(p[3] for p in poslist), encode(poslist))
                for w, poslist in seen.iteritems())

    def encode(self, posns_chars_boosts):
        """Encodes a list of (pos, startchar, endchar, boost) tuples as a
        value string.

        Each tuple is stored as (position delta, gap between the previous end
        character and this start character, token length, boost).
        """
        # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
        codes = []
        posbase = 0
        charbase = 0
        summedboost = 0
        for pos, startchar, endchar, boost in posns_chars_boosts:
            codes.append((pos - posbase, startchar - charbase,
                          endchar - startchar, boost))
            posbase = pos
            charbase = endchar
            summedboost += boost

        # The first two bytes and the last byte of the pickle are dropped;
        # decode_character_boosts() appends "." again before unpickling.
        return (pack_uint(len(posns_chars_boosts)) + pack_float(summedboost)
                + dumps(codes, -1)[2:-1])

    def decode_character_boosts(self, valuestring):
        """Returns the list of (position, startchar, endchar, boost) tuples
        stored in the value string.
        """
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + ".")
        position = 0
        endchar = 0
        posn_char_boosts = []
        for code in codes:
            position = position + code[0]
            startchar = endchar + code[1]
            endchar = startchar + code[2]
            posn_char_boosts.append((position, startchar, endchar, code[3]))
        return posn_char_boosts

    def decode_positions(self, valuestring):
        """Returns only the list of positions stored in the value string."""
        return [item[0] for item in self.decode_character_boosts(valuestring)]

    def decode_characters(self, valuestring):
        """Returns the list of (position, startchar, endchar) triples stored
        in the value string.
        """
        return [(pos, startchar, endchar) for pos, startchar, endchar, _
                in self.decode_character_boosts(valuestring)]

    def decode_position_boosts(self, valuestring):
        """Returns the list of (position, boost) pairs stored in the value
        string.
        """
        return [(pos, boost) for pos, _, _, boost
                in self.decode_character_boosts(valuestring)]
463
464
Note: See TracBrowser for help on using the browser.