| 1 | |
|---|
| 2 | |
|---|
| 3 | |
|---|
| 4 | |
|---|
| 5 | |
|---|
| 6 | |
|---|
| 7 | |
|---|
| 8 | |
|---|
| 9 | |
|---|
| 10 | |
|---|
| 11 | |
|---|
| 12 | |
|---|
| 13 | |
|---|
| 14 | |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | """ |
|---|
| 18 | The classes in this module encode and decode posting information for a field. |
|---|
| 19 | The field format essentially determines what information is stored about each |
|---|
| 20 | occurrence of a term. |
|---|
| 21 | """ |
|---|
| 22 | |
|---|
| 23 | from collections import defaultdict |
|---|
| 24 | from struct import Struct |
|---|
| 25 | from cPickle import dumps, loads |
|---|
| 26 | from cStringIO import StringIO |
|---|
| 27 | |
|---|
| 28 | from whoosh.analysis import unstopped |
|---|
| 29 | from whoosh.system import (_INT_SIZE, _FLOAT_SIZE, pack_uint, unpack_uint, |
|---|
| 30 | pack_float, unpack_float) |
|---|
| 31 | from whoosh.util import varint, read_varint, float_to_byte, byte_to_float |
|---|
| 32 | |
|---|
| 33 | |
|---|
| 34 | |
|---|
| 35 | |
|---|
class Format(object):
    """Abstract base class representing a storage format for a field or vector.
    Format objects are responsible for writing and reading the low-level
    representation of a field. It controls what kind/level of information to
    store about the indexed fields.

    Subclasses implement :meth:`word_values` and :meth:`encode`, and expose
    one ``decode_<name>()`` method for every interpretation of the posting
    value they support (see :meth:`supports` and :meth:`decoder`).
    """

    # NOTE(review): -1 presumably means "no fixed posting size" -- confirm
    # against the code that reads posting_size.
    posting_size = -1
    textual = True
    __inittypes__ = dict(analyzer=object, field_boost=float)

    def __init__(self, analyzer, field_boost=1.0, **options):
        """
        :param analyzer: The analysis.Analyzer object to use to index this
            field. See the analysis module for more information. If this value
            is None, the field is not indexed/searchable.
        :param field_boost: A constant boost factor to scale to the score
            of all queries matching terms in this field.
        :param options: Any additional keyword arguments; they are kept
            unchanged on ``self.options``.
        """

        self.analyzer = analyzer
        self.field_boost = field_boost
        self.options = options

    def __eq__(self, other):
        """Returns True if 'other' is an instance of exactly the same class
        and has equal instance attributes, otherwise False.

        The result is always a real bool (never None or the falsy operand
        itself).
        """
        return bool(other
                    and self.__class__ is other.__class__
                    and self.__dict__ == other.__dict__)

    def __ne__(self, other):
        """Returns the inverse of :meth:`__eq__`.

        Python 2 does not derive ``!=`` from ``__eq__``, so it has to be
        defined explicitly to keep ``==`` and ``!=`` consistent.
        """
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r, boost = %s)" % (self.__class__.__name__,
                                       self.analyzer, self.field_boost)

    def clean(self):
        """Calls the analyzer's ``clean()`` method, if this format has an
        analyzer and the analyzer defines one.
        """
        if self.analyzer and hasattr(self.analyzer, "clean"):
            self.analyzer.clean()

    def word_values(self, value, **kwargs):
        """Takes the text value to be indexed and yields a series of
        ("tokentext", frequency, weight, valuestring) tuples, where frequency
        is the number of times "tokentext" appeared in the value, weight is the
        weight (a float usually equal to frequency in the absence of per-term
        boosts) and valuestring is encoded field-specific posting value for the
        token. For example, in a Frequency format, the value string would be
        the same as frequency; in a Positions format, the value string would
        encode a list of token positions at which "tokentext" occurred.

        :param value: The unicode text to index.
        :raises NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError

    def analyze(self, unicodestring, mode='', **kwargs):
        """Returns a :class:`whoosh.analysis.Token` iterator from the given
        unicode string.

        :param unicodestring: the string to analyze.
        :param mode: a string indicating the purpose for which the unicode
            string is being analyzed, i.e. 'index' or 'query'.
        :raises Exception: if this format has no analyzer.
        """

        if not self.analyzer:
            raise Exception("%s format has no analyzer" % self.__class__)
        return self.analyzer(unicodestring, mode=mode, **kwargs)

    def encode(self, value):
        """Returns the given value encoded as a string.

        :raises NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError

    def supports(self, name):
        """Returns True if this format supports interpreting its posting
        value as 'name' (e.g. "frequency" or "positions").
        """
        return hasattr(self, "decode_" + name)

    def decoder(self, name):
        """Returns the bound method for interpreting value as 'name',
        where 'name' is for example "frequency" or "positions". This
        object must have a corresponding Format.decode_<name>() method.

        :raises AttributeError: if there is no ``decode_<name>`` attribute.
        """
        return getattr(self, "decode_" + name)

    def decode_as(self, astype, valuestring):
        """Interprets the encoded value string as 'astype', where 'astype' is
        for example "frequency" or "positions". This object must have a
        corresponding decode_<astype>() method.
        """
        return self.decoder(astype)(valuestring)
| 124 | |
|---|
| 125 | |
|---|
| 126 | |
|---|
| 127 | |
|---|
class Existence(Format):
    """Records only the fact that a term occurs in a document; neither
    frequencies nor positions are stored. Handy for fields that must be
    searchable but need no scoring, such as a file path.

    Supports: frequency, weight (always reports frequency = 1).
    """

    # The encoded posting value is always the empty string.
    posting_size = 0
    __inittypes__ = dict(analyzer=object, field_boost=float)

    def __init__(self, analyzer, field_boost=1.0, **options):
        """
        :param analyzer: the analyzer callable used to tokenize field values.
        :param field_boost: constant boost; returned as the weight of every
            posting by :meth:`decode_weight`.
        :param options: extra keyword arguments, kept on ``self.options``.
        """
        self.options = options
        self.field_boost = field_boost
        self.analyzer = analyzer

    def word_values(self, value, **kwargs):
        """Returns a generator of ``(text, 1, 1.0, '')`` tuples, one for each
        distinct unstopped token text produced by the analyzer for 'value'.
        """
        # The value is analyzed eagerly; only the tuple building is lazy.
        distinct = set()
        for token in unstopped(self.analyzer(value, **kwargs)):
            distinct.add(token.text)
        return ((text, 1, 1.0, '') for text in distinct)

    def encode(self, value):
        """Returns the empty string regardless of 'value'."""
        return ''

    def decode_frequency(self, valuestring):
        """Returns 1 regardless of 'valuestring'."""
        return 1

    def decode_weight(self, valuestring):
        """Returns the field boost regardless of 'valuestring'."""
        return self.field_boost
| 157 | |
|---|
| 158 | |
|---|
class Frequency(Format):
    """Keeps the number of occurrences of the term for each posting.

    Supports: frequency, weight.
    """

    posting_size = _INT_SIZE
    __inittypes__ = dict(analyzer=object, field_boost=float,
                         boost_as_freq=bool)

    def __init__(self, analyzer, field_boost=1.0, boost_as_freq=False,
                 **options):
        """
        :param analyzer: The analysis.Analyzer object to use to index this
            field. See the analysis module for more information. If this value
            is None, the field is not indexed/searchable.
        :param field_boost: A constant boost factor to scale to the score of
            all queries matching terms in this field.
        :param boost_as_freq: Accepted for interface compatibility; this
            constructor neither stores nor reads it.
        :param options: Extra keyword arguments, kept on ``self.options``.
        """

        self.options = options
        self.field_boost = field_boost
        self.analyzer = analyzer

    def word_values(self, value, **kwargs):
        """Returns a generator of ``(text, frequency, weight, valuestring)``
        tuples, one per distinct unstopped token text in 'value'. The weight
        is the sum of the boosts of the token's occurrences and the value
        string is the encoded frequency.
        """
        counts = defaultdict(int)
        boost_totals = defaultdict(float)

        tokens = unstopped(self.analyzer(value, boosts=True, **kwargs))
        for token in tokens:
            text = token.text
            counts[text] += 1
            boost_totals[text] += token.boost

        # Bind the encoder once, before the lazy generator is created.
        pack = self.encode
        return ((text, count, boost_totals[text], pack(count))
                for text, count in counts.iteritems())

    def encode(self, freq):
        """Returns 'freq' packed with ``pack_uint``."""
        return pack_uint(freq)

    def decode_frequency(self, valuestring):
        """Returns the frequency unpacked from 'valuestring'."""
        unpacked = unpack_uint(valuestring)
        return unpacked[0]

    def decode_weight(self, valuestring):
        """Returns the unpacked frequency multiplied by the field boost."""
        unpacked = unpack_uint(valuestring)
        return unpacked[0] * self.field_boost
| 204 | |
|---|
| 205 | |
|---|
class DocBoosts(Frequency):
    """A format whose postings hold the term frequency together with a
    per-document boost.

    Supports: frequency, weight.
    """

    # The packed frequency followed by one extra byte for the boost.
    posting_size = _INT_SIZE + 1

    def word_values(self, value, doc_boost=1.0, **kwargs):
        """Returns a generator of ``(text, frequency, weight, valuestring)``
        tuples, one per distinct unstopped token text in 'value'. The weight
        is the summed token boost times 'doc_boost'; the value string encodes
        ``(frequency, doc_boost)``.
        """
        counts = defaultdict(int)
        boost_totals = defaultdict(float)
        tokens = unstopped(self.analyzer(value, boosts=True, **kwargs))
        for token in tokens:
            text = token.text
            boost_totals[text] += token.boost
            counts[text] += 1

        # Bind the encoder once, before the lazy generator is created.
        pack = self.encode
        return ((text, count, boost_totals[text] * doc_boost,
                 pack((count, doc_boost)))
                for text, count in counts.iteritems())

    def encode(self, freq_docboost):
        """Returns the packed frequency followed by the document boost
        converted with ``float_to_byte``.

        :param freq_docboost: a ``(frequency, docboost)`` pair.
        """
        count, boost = freq_docboost
        packed_count = pack_uint(count)
        return packed_count + float_to_byte(boost)

    def decode_docboosts(self, valuestring):
        """Returns the ``(frequency, docboost)`` pair held in 'valuestring'."""
        count = unpack_uint(valuestring[:_INT_SIZE])[0]
        # The boost is taken from the last element of the value string.
        boost = byte_to_float(valuestring[-1])
        return (count, boost)

    def decode_frequency(self, valuestring):
        """Returns the frequency held in the first ``_INT_SIZE`` elements."""
        head = valuestring[0:_INT_SIZE]
        return unpack_uint(head)[0]

    def decode_weight(self, valuestring):
        """Returns frequency * document boost * field boost."""
        count = unpack_uint(valuestring[:_INT_SIZE])[0]
        boost = byte_to_float(valuestring[-1])
        return count * boost * self.field_boost
| 242 | |
|---|
| 243 | |
|---|
| 244 | |
|---|
| 245 | |
|---|
class Positions(Format):
    """A format that records the positions of a term in each posting, which
    makes phrase searching and "near" queries possible.

    Supports: frequency, weight, positions, position_boosts (always reports
    position boost = 1.0).
    """

    def word_values(self, value, start_pos=0, **kwargs):
        """Returns a generator of ``(text, frequency, weight, valuestring)``
        tuples, one per distinct unstopped token text in 'value'. Frequency is
        the number of recorded positions, weight is the summed token boost and
        the value string is the encoded position list.
        """
        positions_by_text = defaultdict(list)
        boost_totals = defaultdict(float)
        tokens = unstopped(self.analyzer(value, positions=True,
                                         start_pos=start_pos, **kwargs))
        for token in tokens:
            text = token.text
            # NOTE(review): start_pos is forwarded to the analyzer AND added
            # to t.pos here, while the Characters/PositionBoosts subclasses
            # store t.pos unshifted -- confirm whether the analyzer already
            # offsets token positions.
            positions_by_text[text].append(start_pos + token.pos)
            boost_totals[text] += token.boost

        # Bind the encoder once, before the lazy generator is created.
        pack = self.encode
        return ((text, len(found), boost_totals[text], pack(found))
                for text, found in positions_by_text.iteritems())

    def encode(self, positions):
        """Returns the packed number of positions followed by a pickled list
        of position deltas (each position minus the one before it, starting
        from 0).
        """
        deltas = []
        previous = 0
        for current in positions:
            deltas.append(current - previous)
            previous = current
        # The first two bytes and the last byte of the pickle are cut off;
        # the decode methods append "." again before unpickling.
        pickled = dumps(deltas, -1)[2:-1]
        return pack_uint(len(deltas)) + pickled

    def decode_positions(self, valuestring):
        """Returns the list of absolute positions held in 'valuestring'."""
        deltas = loads(valuestring[_INT_SIZE:] + ".")
        result = []
        running = 0
        for delta in deltas:
            running += delta
            result.append(running)
        return result

    def decode_frequency(self, valuestring):
        """Returns the count packed in the first ``_INT_SIZE`` elements."""
        head = valuestring[:_INT_SIZE]
        return unpack_uint(head)[0]

    def decode_weight(self, valuestring):
        """Returns the decoded frequency multiplied by the field boost."""
        frequency = self.decode_frequency(valuestring)
        return frequency * self.field_boost

    def decode_position_boosts(self, valuestring):
        """Returns a list of ``(position, 1)`` pairs, one per decoded
        position.
        """
        pairs = []
        for position in self.decode_positions(valuestring):
            pairs.append((position, 1))
        return pairs
| 288 | |
|---|
| 289 | def decode_position_boosts(self, valuestring): |
|---|
| 290 | return [(pos, 1) for pos in self.decode_positions(valuestring)] |
|---|
| 291 | |
|---|
| 292 | |
|---|
class Characters(Positions):
    """Keeps, for each posting, the token positions together with the start
    and end character offsets of every occurrence.

    Supports: frequency, weight, positions, position_boosts (always reports
    position boost = 1.0), characters.
    """

    def word_values(self, value, start_pos=0, start_char=0, **kwargs):
        """Returns a generator of ``(text, frequency, weight, valuestring)``
        tuples, one per distinct unstopped token text in 'value'. Frequency is
        the number of occurrences, weight is that same number as a float and
        the value string encodes the ``(pos, startchar, endchar)`` triples.
        """
        occurrences = defaultdict(list)

        tokens = unstopped(self.analyzer(value, positions=True, chars=True,
                                         start_pos=start_pos,
                                         start_char=start_char, **kwargs))
        for token in tokens:
            # NOTE(review): start_char is forwarded to the analyzer AND added
            # to both character offsets here -- confirm the analyzer does not
            # already apply it.
            triple = (token.pos,
                      start_char + token.startchar,
                      start_char + token.endchar)
            occurrences[token.text].append(triple)

        # Bind the encoder once, before the lazy generator is created.
        pack = self.encode
        return ((text, len(found), float(len(found)), pack(found))
                for text, found in occurrences.iteritems())

    def encode(self, posns_chars):
        """Returns the packed number of entries followed by a pickled list of
        ``(position delta, gap from previous endchar to startchar, length)``
        triples.

        :param posns_chars: a sequence of ``(pos, startchar, endchar)``
            triples.
        """
        deltas = []
        last_pos = 0
        last_end = 0
        for pos, startchar, endchar in posns_chars:
            deltas.append((pos - last_pos,
                           startchar - last_end,
                           endchar - startchar))
            last_pos = pos
            last_end = endchar
        # The first two bytes and the last byte of the pickle are cut off;
        # the decode methods append "." again before unpickling.
        pickled = dumps(deltas, -1)[2:-1]
        return pack_uint(len(posns_chars)) + pickled

    def decode_characters(self, valuestring):
        """Returns the list of absolute ``(position, startchar, endchar)``
        triples held in 'valuestring'.
        """
        deltas = loads(valuestring[_INT_SIZE:] + ".")
        result = []
        running_pos = 0
        running_end = 0
        for delta in deltas:
            running_pos = delta[0] + running_pos
            begin = delta[1] + running_end
            running_end = delta[2] + begin
            result.append((running_pos, begin, running_end))
        return result

    def decode_positions(self, valuestring):
        """Returns the list of absolute positions held in 'valuestring',
        ignoring the character information.
        """
        deltas = loads(valuestring[_INT_SIZE:] + ".")
        result = []
        running_pos = 0
        for delta in deltas:
            running_pos = delta[0] + running_pos
            result.append(running_pos)
        return result
| 345 | |
|---|
| 346 | |
|---|
class PositionBoosts(Positions):
    """A format that stores positions and per-position boost information
    in each posting.

    The encoded value is the packed number of entries, then the packed sum of
    the boosts, then a pickled list of ``(position delta, boost)`` pairs.

    Supports: frequency, weight, positions, position_boosts.
    """

    def word_values(self, value, start_pos=0, **kwargs):
        """Returns a generator of ``(text, frequency, weight, valuestring)``
        tuples, one per distinct unstopped token text in 'value'. Frequency is
        the number of occurrences, weight is the sum of their boosts and the
        value string encodes the ``(pos, boost)`` pairs.
        """
        # The factory must be ``list``: defaultdict calls it with no
        # arguments, and a bare ``iter()`` call raises TypeError.
        seen = defaultdict(list)
        for t in unstopped(self.analyzer(value, positions=True, boosts=True,
                                         start_pos=start_pos, **kwargs)):
            seen[t.text].append((t.pos, t.boost))

        # Bind the encoder once, before the lazy generator is created.
        encode = self.encode
        return ((w, len(poslist), sum(p[1] for p in poslist), encode(poslist))
                for w, poslist in seen.iteritems())

    def encode(self, posns_boosts):
        """Returns the encoded value string for a sequence of
        ``(pos, boost)`` pairs.
        """
        codes = []
        base = 0
        summedboost = 0
        for pos, boost in posns_boosts:
            summedboost += boost
            # Positions are stored as the difference from the previous one.
            codes.append((pos - base, boost))
            base = pos

        # The first two bytes and the last byte of the pickle are cut off;
        # the decode methods append "." again before unpickling.
        return (pack_uint(len(posns_boosts)) + pack_float(summedboost)
                + dumps(codes, -1)[2:-1])

    def decode_position_boosts(self, valuestring):
        """Returns the list of absolute ``(position, boost)`` pairs held in
        'valuestring'.
        """
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + ".")
        position = 0
        posns_boosts = []
        for code in codes:
            position = code[0] + position
            posns_boosts.append((position, code[1]))
        return posns_boosts

    def decode_positions(self, valuestring):
        """Returns the list of absolute positions held in 'valuestring',
        ignoring the boosts.
        """
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + ".")
        position = 0
        posns = []
        for code in codes:
            position = code[0] + position
            posns.append(position)
        return posns

    def decode_weight(self, valuestring):
        """Returns the summed boost stored right after the packed count.

        Unlike Positions.decode_weight, the field boost is not applied here.
        """
        summedboost = unpack_float(
            valuestring[_INT_SIZE:_INT_SIZE + _FLOAT_SIZE])[0]
        return summedboost
| 400 | |
|---|
| 401 | |
|---|
class CharacterBoosts(Characters):
    """A format that stores positions, character start and end, and
    per-position boost information in each posting.

    The encoded value is the packed number of entries, then the packed sum of
    the boosts, then a pickled list of ``(position delta, gap from previous
    endchar to startchar, length, boost)`` tuples.

    Supports: frequency, weight, positions, position_boosts, characters,
    character_boosts.
    """

    def word_values(self, value, start_pos=0, start_char=0, **kwargs):
        """Returns a generator of ``(text, frequency, weight, valuestring)``
        tuples, one per distinct unstopped token text in 'value'. Frequency is
        the number of occurrences, weight is the sum of their boosts and the
        value string encodes the ``(pos, startchar, endchar, boost)`` tuples.
        """
        # The factory must be ``list``: defaultdict calls it with no
        # arguments, and a bare ``iter()`` call raises TypeError.
        seen = defaultdict(list)
        # Character offsets are requested with ``chars=True``, the same
        # keyword the parent Characters format passes to the analyzer.
        for t in unstopped(self.analyzer(value, positions=True,
                                         chars=True, boosts=True,
                                         start_pos=start_pos,
                                         start_char=start_char, **kwargs)):
            # NOTE(review): start_char is forwarded to the analyzer AND added
            # to both character offsets here -- confirm the analyzer does not
            # already apply it.
            seen[t.text].append((t.pos,
                                 start_char + t.startchar,
                                 start_char + t.endchar,
                                 t.boost))

        # Bind the encoder once, before the lazy generator is created.
        encode = self.encode
        return ((w, len(poslist), sum(p[3] for p in poslist), encode(poslist))
                for w, poslist in seen.iteritems())

    def encode(self, posns_chars_boosts):
        """Returns the encoded value string for a sequence of
        ``(pos, startchar, endchar, boost)`` tuples.
        """
        codes = []
        posbase = 0
        charbase = 0
        summedboost = 0
        for pos, startchar, endchar, boost in posns_chars_boosts:
            codes.append((pos - posbase, startchar - charbase,
                          endchar - startchar, boost))
            posbase = pos
            charbase = endchar
            summedboost += boost

        # The first two bytes and the last byte of the pickle are cut off;
        # decode_character_boosts appends "." again before unpickling.
        return (pack_uint(len(posns_chars_boosts)) + pack_float(summedboost)
                + dumps(codes, -1)[2:-1])

    def decode_character_boosts(self, valuestring):
        """Returns the list of absolute ``(position, startchar, endchar,
        boost)`` tuples held in 'valuestring'.
        """
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:] + ".")
        position = 0
        endchar = 0
        posn_char_boosts = []
        for code in codes:
            position = position + code[0]
            startchar = endchar + code[1]
            endchar = startchar + code[2]
            posn_char_boosts.append((position, startchar, endchar, code[3]))
        return posn_char_boosts

    def decode_positions(self, valuestring):
        """Returns only the positions from the decoded tuples."""
        return [item[0] for item in self.decode_character_boosts(valuestring)]

    def decode_characters(self, valuestring):
        """Returns ``(position, startchar, endchar)`` triples, dropping the
        boosts.
        """
        return [(pos, startchar, endchar) for pos, startchar, endchar, _
                in self.decode_character_boosts(valuestring)]

    def decode_position_boosts(self, valuestring):
        """Returns ``(position, boost)`` pairs, dropping the character
        offsets.
        """
        return [(pos, boost) for pos, _, _, boost
                in self.decode_character_boosts(valuestring)]
| 463 | |
|---|
| 464 | |
|---|