root/branches/mbutscher/work/lib/whoosh/matching.py @ 231

Revision 231, 41.5 kB (checked in by mbutscher, 2 years ago)

branches/stable-2.0:
2.0rc10
* Bug fixed: Deprecated os.popen...() functions used

branches/mbutscher/work:
2.1beta10
* Menu item and toolbar icon to move up from subpage

to its superpage

* Bug fixed: Deprecated os.popen...() functions used

Line 
1#===============================================================================
2# Copyright 2010 Matt Chaput
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#    http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#===============================================================================
16
17from bisect import bisect_left, bisect_right, insort
18from itertools import izip, repeat
19
20
21"""
22This module contains "matcher" classes. Matchers deal with posting lists. The
23most basic matcher, which reads the list of postings for a term, will be
24provided by the backend implementation (for example,
25``whoosh.filedb.filepostings.FilePostingReader``). The classes in this module
26provide additional functionality, such as combining the results of two
27matchers, or modifying the results of a matcher.
28
29You do not need to deal with the classes in this module unless you need to
30write your own Matcher implementation to provide some new functionality. These
31classes are not instantiated by the user. They are usually created by a
32:class:`~whoosh.query.Query` object's ``matcher()`` method, which returns the
33appropriate matcher to implement the query (for example, the ``Or`` query's
34``matcher()`` method returns a ``UnionMatcher`` object).
35
36Certain backends support "quality" optimizations. These backends have the
37ability to skip ahead if it knows the current block of postings can't
38contribute to the top N documents. If the matcher tree and backend support
39these optimizations, the matcher's ``supports_quality()`` method will return
40``True``.
41"""
42
43
44class ReadTooFar(Exception):
45    """Raised when next() or skip_to() is called on an inactive matchers.
46    """
47
48
49class NoQualityAvailable(Exception):
50    """Raised when quality methods are called on a matcher that does not
51    support quality-based optimizations.
52    """
53
54
55# Matchers
56
57class Matcher(object):
58    """Base class for all matchers.
59    """
60   
61    def is_active(self):
62        """Returns True if this matcher is still "active", that is, it has not
63        yet reached the end of the posting list.
64        """
65       
66        raise NotImplementedError
67   
68    def replace(self):
69        """Returns a possibly-simplified version of this matcher. For example,
70        if one of the children of a UnionMatcher is no longer active, calling
71        this method on the UnionMatcher will return the other child.
72        """
73       
74        return self
75   
76    def copy(self):
77        """Returns a copy of this matcher.
78        """
79       
80        raise NotImplementedError
81   
82    def depth(self):
83        """Returns the depth of the tree under this matcher, or 0 if this
84        matcher does not have any children.
85        """
86       
87        return 0
88   
89    def supports_quality(self):
90        """Returns True if this matcher supports the use of ``quality`` and
91        ``block_quality``.
92        """
93       
94        return False
95   
96    def quality(self):
97        """Returns a quality measurement of the current posting, according to
98        the current weighting algorithm. Raises ``NoQualityAvailable`` if the
99        matcher or weighting do not support quality measurements.
100        """
101       
102        raise NoQualityAvailable
103   
104    def block_quality(self):
105        """Returns a quality measurement of the current block of postings,
106        according to the current weighting algorithm. Raises
107        ``NoQualityAvailable`` if the matcher or weighting do not support
108        quality measurements.
109        """
110       
111        raise NoQualityAvailable
112   
113    def id(self):
114        """Returns the ID of the current posting.
115        """
116       
117        raise NotImplementedError
118   
119    def all_ids(self):
120        """Returns a generator of all IDs in the matcher.
121       
122        What this method returns for a matcher that has already read some
123        postings (whether it only yields the remaining postings or all postings
124        from the beginning) is undefined, so it's best to only use this method
125        on fresh matchers.
126        """
127       
128        i = 0
129        while self.is_active():
130            yield self.id()
131            self.next()
132            i += 1
133            if i == 10:
134                self = self.replace()
135                i = 0
136               
137    def all_items(self):
138        """Returns a generator of all (ID, encoded value) pairs in the matcher.
139       
140        What this method returns for a matcher that has already read some
141        postings (whether it only yields the remaining postings or all postings
142        from the beginning) is undefined, so it's best to only use this method
143        on fresh matchers.
144        """
145       
146        i = 0
147        while self.is_active():
148            yield (self.id(), self.value())
149            self.next()
150            i += 1
151            if i == 10:
152                self = self.replace()
153                i = 0
154   
155    def items_as(self, astype):
156        """Returns a generator of all (ID, decoded value) pairs in the matcher.
157       
158        What this method returns for a matcher that has already read some
159        postings (whether it only yields the remaining postings or all postings
160        from the beginning) is undefined, so it's best to only use this method
161        on fresh matchers.
162        """
163       
164        while self.is_active():
165            yield (self.id(), self.value_as(astype))
166   
167    def value(self):
168        """Returns the encoded value of the current posting.
169        """
170       
171        raise NotImplementedError
172   
173    def supports(self, astype):
174        """Returns True if the field's format supports the named data type,
175        for example 'frequency' or 'characters'.
176        """
177       
178        raise NotImplementedError("supports not implemented in %s" % self.__class__)
179   
180    def value_as(self, astype):
181        """Returns the value(s) of the current posting as the given type.
182        """
183       
184        raise NotImplementedError("value_as not implemented in %s" % self.__class__)
185   
186    def spans(self):
187        """Returns a list of :class:`whoosh.spans.Span` objects for the matches
188        in this document. Raises an exception if the field being searched does
189        not store positions.
190        """
191       
192        from whoosh.spans import Span
193        if self.supports("characters"):
194            return [Span(pos, startchar=startchar, endchar=endchar)
195                    for pos, startchar, endchar in self.value_as("characters")]
196        elif self.supports("positions"):
197            return [Span(pos) for pos in self.value_as("positions")]
198        else:
199            raise Exception("Field does not support spans")
200       
201    def skip_to(self, id):
202        """Moves this matcher to the first posting with an ID equal to or
203        greater than the given ID.
204        """
205       
206        while self.is_active() and self.id() < id:
207            self.next()
208   
209    def skip_to_quality(self, minquality):
210        """Moves this matcher to the next block with greater than the given
211        minimum quality value.
212        """
213       
214        raise NotImplementedError
215   
216    def next(self):
217        """Moves this matcher to the next posting.
218        """
219       
220        raise NotImplementedError(self.__class__.__name__)
221   
222    def weight(self):
223        """Returns the weight of the current posting.
224        """
225       
226        return self.value_as("weight")
227   
228    def score(self):
229        """Returns the score of the current posting.
230        """
231       
232        raise NotImplementedError
233   
234
235class NullMatcher(Matcher):
236    """Matcher with no postings which is never active.
237    """
238   
239    def is_active(self):
240        return False
241   
242    def all_ids(self):
243        return []
244   
245    def copy(self):
246        return self
247   
248
249class ListMatcher(Matcher):
250    """Synthetic matcher backed by a list of IDs.
251    """
252   
253    def __init__(self, ids, weights=None, values=None, format=None,
254                 scorer=None, position=0):
255        """
256        :param ids: a list of doc IDs.
257        :param weights: a list of weights corresponding to the list of IDs.
258            If this argument is not supplied, a list of 1.0 values is used.
259        :param values: a list of encoded values corresponding to the list of
260            IDs.
261        :param format: a :class:`whoosh.formats.Format` object representing the
262            format of the field.
263        :param scorer: a :class:`whoosh.scoring.BaseScorer` object for scoring
264            the postings.
265        """
266       
267        self._ids = ids
268        self._weights = weights
269        self._values = values
270        self._i = position
271        self._format = format
272        self._scorer = scorer
273   
274    def __repr__(self):
275        return "%s(%r, %r, %r, %d)" % (self.__class__.__name__, self._ids,
276                                       self._weights, self._values, self._i)
277   
278    def is_active(self):
279        return self._i < len(self._ids)
280   
281    def copy(self):
282        return self.__class__(self._ids, self._weights, self._values,
283                              self._format, self._scorer, self._i)
284   
285    def supports_quality(self):
286        return self._scorer is not None
287   
288    def quality(self):
289        return self._scorer.quality(self)
290   
291    def id(self):
292        return self._ids[self._i]
293   
294    def all_ids(self):
295        return iter(self._ids)
296   
297    def all_items(self):
298        values = self._values
299        if values is None:
300            values = repeat('')
301       
302        return izip(self._ids, values)
303   
304    def value(self):
305        if self._values:
306            return self._values[self._i]
307        else:
308            return ''
309   
310    def value_as(self, astype):
311        decoder = self._format.decoder(astype)
312        return decoder(self.value())
313   
314    def supports(self, astype):
315        return self._format.supports(astype)
316   
317    def next(self):
318        self._i += 1
319   
320    def weight(self):
321        if self._weights:
322            return self._weights[self._i]
323        else:
324            return 1.0
325   
326    def score(self):
327        if self._scorer:
328            return self._scorer.score(self)
329        else:
330            return self.weight()
331
332
333class WrappingMatcher(Matcher):
334    """Base class for matchers that wrap sub-matchers.
335    """
336   
337    def __init__(self, child, boost=1.0):
338        self.child = child
339        self.boost = boost
340   
341    def __repr__(self):
342        return "%s(%r, boost=%s)" % (self.__class__.__name__, self.child, self.boost)
343   
344    def copy(self):
345        kwargs = {}
346        if hasattr(self, "boost"):
347            kwargs["boost"] = self.boost
348        return self.__class__(self.child.copy(), **kwargs)
349   
350    def depth(self):
351        return 1 + self.child.depth()
352   
353    def _replacement(self, newchild):
354        return self.__class__(newchild, boost=self.boost)
355   
356    def replace(self):
357        r = self.child.replace()
358        if not r.is_active():
359            return NullMatcher()
360        if r is not self.child:
361            try:
362                return self._replacement(r)
363            except TypeError, e:
364                raise TypeError("Class %s got exception %s trying "
365                                "to replace itself" % (self.__class__, e))
366        else:
367            return self
368   
369    def id(self):
370        return self.child.id()
371   
372    def all_ids(self):
373        return self.child.all_ids()
374   
375    def is_active(self):
376        return self.child.is_active()
377   
378    def supports(self, astype):
379        return self.child.supports(astype)
380   
381    def value(self):
382        return self.child.value()
383   
384    def value_as(self, astype):
385        return self.child.value_as(astype)
386   
387    def spans(self):
388        return self.child.spans()
389   
390    def skip_to(self, id):
391        return self.child.skip_to(id)
392   
393    def next(self):
394        self.child.next()
395   
396    def supports_quality(self):
397        return self.child.supports_quality()
398   
399    def skip_to_quality(self, minquality):
400        return self.child.skip_to_quality(minquality/self.boost)
401   
402    def quality(self):
403        return self.child.quality() * self.boost
404   
405    def block_quality(self):
406        return self.child.block_quality() * self.boost
407   
408    def weight(self):
409        return self.child.weight() * self.boost
410   
411    def score(self):
412        return self.child.score() * self.boost
413
414
415class MultiMatcher(Matcher):
416    """Serializes the results of a list of sub-matchers.
417    """
418   
419    def __init__(self, matchers, idoffsets, current=0):
420        """
421        :param matchers: a list of Matcher objects.
422        :param idoffsets: a list of offsets corresponding to items in the
423            ``matchers`` list.
424        """
425       
426        self.matchers = matchers
427        self.offsets = idoffsets
428        self.current = current
429        self._next_matcher()
430   
431    def __repr__(self):
432        return "%s(%r, %r, current=%s)" % (self.__class__.__name__,
433                                           self.matchers, self.offsets,
434                                           self.current)
435   
436    def is_active(self):
437        return self.current < len(self.matchers)
438   
439    def _next_matcher(self):
440        matchers = self.matchers
441        while self.current < len(matchers) and not matchers[self.current].is_active():
442            self.current += 1
443       
444    def copy(self):
445        return self.__class__([mr.copy() for mr in self.matchers[self.current:]],
446                              self.offsets[self.current:], current=self.current)
447   
448    def depth(self):
449        if self.is_active():
450            return 1 + max(mr.depth() for mr in self.matchers[self.current:])
451        else:
452            return 0
453   
454    def replace(self):
455        if not self.is_active():
456            return NullMatcher()
457        # TODO: Possible optimization: if the last matcher is current, replace
458        # this with the last matcher, but wrap it with a matcher that adds the
459        # offset. Have to check whether that's actually faster, though.
460        return self
461   
462    def id(self):
463        current = self.current
464        return self.matchers[current].id() + self.offsets[current]
465   
466    def all_ids(self):
467        offsets = self.offsets
468        for i, mr in enumerate(self.matchers):
469            for id in mr.all_ids():
470                yield id + offsets[i]
471   
472    def spans(self):
473        return self.matchers[self.current].spans()
474   
475    def supports(self, astype):
476        return self.matchers[self.current].supports(astype)
477   
478    def value(self):
479        return self.matchers[self.current].value()
480   
481    def value_as(self, astype):
482        return self.matchers[self.current].value_as(astype)
483   
484    def next(self):
485        if not self.is_active(): raise ReadTooFar
486       
487        self.matchers[self.current].next()
488        if not self.matchers[self.current].is_active():
489            self._next_matcher()
490       
491    def skip_to(self, id):
492        if not self.is_active(): raise ReadTooFar
493        if id <= self.id(): return
494       
495        matchers = self.matchers
496        offsets = self.offsets
497        r = False
498       
499        while self.current < len(matchers) and id > self.id():
500            mr = matchers[self.current]
501            sr = mr.skip_to(id - offsets[self.current])
502            r = sr or r
503            if mr.is_active():
504                break
505           
506            self._next_matcher()
507           
508        return r
509   
510    def supports_quality(self):
511        return all(mr.supports_quality() for mr in self.matchers[self.current:])
512   
513    def quality(self):
514        return self.matchers[self.current].quality()
515   
516    def block_quality(self):
517        return self.matchers[self.current].block_quality()
518   
519    def weight(self):
520        return self.matchers[self.current].weight()
521   
522    def score(self):
523        return self.matchers[self.current].score()
524
525
526class ExcludeMatcher(WrappingMatcher):
527    """Excludes a list of IDs from the postings returned by the wrapped
528    matcher.
529    """
530   
531    def __init__(self, child, excluded, boost=1.0):
532        super(ExcludeMatcher, self).__init__(child)
533        self.excluded = excluded
534        self.boost = boost
535        self._find_next()
536   
537    def __repr__(self):
538        return "%s(%r, %r, boost=%s)" % (self.__class__.__name__, self.child,
539                                         self.excluded, self.boost)
540   
541    def copy(self):
542        return self.__class__(self.child.copy(), self.excluded, boost=self.boost)
543   
544    def _replacement(self, newchild):
545        return self.__class__(newchild, self.excluded, boost=self.boost)
546   
547    def _find_next(self):
548        child = self.child
549        excluded = self.excluded
550        r = False
551        while child.is_active() and child.id() in excluded:
552            r = child.next() or r
553        return r
554   
555    def next(self):
556        self.child.next()
557        self._find_next()
558       
559    def skip_to(self, id):
560        self.child.skip_to(id)
561        self._find_next()
562       
563    def all_ids(self):
564        excluded = self.excluded
565        return (id for id in self.child.all_ids() if id not in excluded)
566   
567    def all_items(self):
568        excluded = self.excluded
569        return (item for item in self.child.all_items()
570                if item[0] not in excluded)
571
572
573class BiMatcher(Matcher):
574    """Base class for matchers that combine the results of two sub-matchers in
575    some way.
576    """
577   
578    def __init__(self, a, b):
579        super(BiMatcher, self).__init__()
580        self.a = a
581        self.b = b
582
583    def __repr__(self):
584        return "%s(%r, %r)" % (self.__class__.__name__, self.a, self.b)
585
586    def copy(self):
587        return self.__class__(self.a.copy(), self.b.copy())
588
589    def depth(self):
590        return 1 + max(self.a.depth(), self.b.depth())
591
592    def skip_to(self, id):
593        if not self.is_active(): raise ReadTooFar
594        ra = self.a.skip_to(id)
595        rb = self.b.skip_to(id)
596        return ra or rb
597       
598    def supports_quality(self):
599        return self.a.supports_quality() and self.b.supports_quality()
600   
601    def supports(self, astype):
602        return self.a.supports(astype) and self.b.supports(astype)
603   
604
605class AdditiveBiMatcher(BiMatcher):
606    """Base class for binary matchers where the scores of the sub-matchers are
607    added together.
608    """
609   
610    def quality(self):
611        q = 0.0
612        if self.a.is_active(): q += self.a.quality()
613        if self.b.is_active(): q += self.b.quality()
614        return q
615   
616    def block_quality(self):
617        bq = 0.0
618        if self.a.is_active(): bq += self.a.block_quality()
619        if self.b.is_active(): bq += self.b.block_quality()
620        return bq
621   
622    def weight(self):
623        return (self.a.weight() + self.b.weight())
624   
625    def score(self):
626        return (self.a.score() + self.b.score())
627   
628
629class UnionMatcher(AdditiveBiMatcher):
630    """Matches the union (OR) of the postings in the two sub-matchers.
631    """
632   
633    def replace(self):
634        a = self.a.replace()
635        b = self.b.replace()
636       
637        a_active = a.is_active()
638        b_active = b.is_active()
639        if not (a_active or b_active): return NullMatcher()
640        if not a_active:
641            return b
642        if not b_active:
643            return a
644       
645        if a is not self.a or b is not self.b:
646            return self.__class__(a, b)
647        return self
648   
649    def is_active(self):
650        return self.a.is_active() or self.b.is_active()
651   
652    def skip_to(self, id):
653        ra = rb = False
654        if self.a.is_active():
655            ra = self.a.skip_to(id)
656        if self.b.is_active():
657            rb = self.b.skip_to(id)
658        return ra or rb
659   
660    def id(self):
661        a = self.a
662        b = self.b
663        if not a.is_active(): return b.id()
664        if not b.is_active(): return a.id()
665        return min(a.id(), b.id())
666   
667    # Using sets is faster in most cases, but could potentially use a lot of
668    # memory
669    def all_ids(self):
670        return iter(sorted(set(self.a.all_ids()) | set(self.b.all_ids())))
671   
672    def next(self):
673        a = self.a
674        b = self.b
675        a_active = a.is_active()
676        b_active = b.is_active()
677       
678        # Shortcut when one matcher is inactive
679        if not (a_active or b_active):
680            raise ReadTooFar
681        elif not a_active:
682            return b.next()
683        elif not b_active:
684            return a.next()
685       
686        a_id = a.id()
687        b_id = b.id()
688        ar = br = None
689        if a_id <= b_id: ar = a.next()
690        if b_id <= a_id: br = b.next()
691        return ar or br
692   
693    def spans(self):
694        # Returns the branch that currently matches, or None if both branches
695        # currently match
696        if not self.a.is_active(): return self.b.spans()
697        if not self.b.is_active(): return self.a.spans()
698        id_a = self.a.id()
699        id_b = self.b.id()
700        if id_a < id_b:
701            return self.a.spans()
702        elif id_b < id_a:
703            return self.b.spans()
704        else:
705            return sorted(set(self.a.spans()) | set(self.b.spans()))
706   
707    def score(self):
708        a = self.a
709        b = self.b
710       
711        if not a.is_active(): return b.score()
712        if not b.is_active(): return a.score()
713       
714        id_a = a.id()
715        id_b = b.id()
716        if id_a < id_b:
717            return a.score()
718        elif id_b < id_a:
719            return b.score()
720        else:
721            return (a.score() + b.score())
722   
723    def skip_to_quality(self, minquality):
724        a = self.a
725        b = self.b
726        minquality = minquality
727       
728        # Short circuit if one matcher is inactive
729        if not a.is_active():
730            sk = b.skip_to_quality(minquality)
731            return sk
732        elif not b.is_active():
733            return a.skip_to_quality(minquality)
734       
735        skipped = 0
736        aq = a.block_quality()
737        bq = b.block_quality()
738        while a.is_active() and b.is_active() and aq + bq <= minquality:
739            if aq < bq:
740                skipped += a.skip_to_quality(minquality - bq)
741                aq = a.block_quality()
742            else:
743                skipped += b.skip_to_quality(minquality - aq)
744                bq = b.block_quality()
745               
746        return skipped
747       
748
749class DisjunctionMaxMatcher(UnionMatcher):
750    """Matches the union (OR) of two sub-matchers. Where both sub-matchers
751    match the same posting, returns the weight/score of the higher-scoring
752    posting.
753    """
754   
755    # TODO: this class inherits from AdditiveBiMatcher (through UnionMatcher)
756    # but it does not add the scores of the sub-matchers together (it
757    # overrides all methods that perform addition). Need to clean up the
758    # inheritance.
759   
760    def __init__(self, a, b, tiebreak=0.0):
761        super(DisjunctionMaxMatcher, self).__init__(a, b)
762        self.tiebreak = tiebreak
763   
764    def copy(self):
765        return self.__class__(self.a.copy(), self.b.copy(),
766                              tiebreak=self.tiebreak)
767   
768    def score(self):
769        return max(self.a.score(), self.b.score())
770   
771    def quality(self):
772        return max(self.a.quality(), self.b.quality())
773   
774    def block_quality(self):
775        return max(self.a.block_quality(), self.b.block_quality())
776   
777    def skip_to_quality(self, minquality):
778        a = self.a
779        b = self.b
780        minquality = minquality
781       
782        # Short circuit if one matcher is inactive
783        if not a.is_active():
784            sk = b.skip_to_quality(minquality)
785            return sk
786        elif not b.is_active():
787            return a.skip_to_quality(minquality)
788       
789        skipped = 0
790        aq = a.block_quality()
791        bq = b.block_quality()
792        while a.is_active() and b.is_active() and max(aq, bq) <= minquality:
793            if aq <= minquality:
794                skipped += a.skip_to_quality(minquality)
795                aq = a.block_quality()
796            if bq <= minquality:
797                skipped += b.skip_to_quality(minquality)
798                bq = b.block_quality()
799        return skipped
800       
801
802class IntersectionMatcher(AdditiveBiMatcher):
803    """Matches the intersection (AND) of the postings in the two sub-matchers.
804    """
805   
806    def __init__(self, a, b):
807        super(IntersectionMatcher, self).__init__(a, b)
808        if (self.a.is_active()
809            and self.b.is_active()
810            and self.a.id() != self.b.id()):
811            self._find_next()
812   
813    def replace(self):
814        a = self.a.replace()
815        b = self.b.replace()
816       
817        a_active = a
818        b_active = b.is_active()
819        if not (a_active and b_active): return NullMatcher()
820       
821        if a is not self.a or b is not self.b:
822            return self.__class__(a, b)
823        return self
824   
825    def is_active(self):
826        return self.a.is_active() and self.b.is_active()
827   
828    def _find_next(self):
829        a = self.a
830        b = self.b
831        a_id = a.id()
832        b_id = b.id()
833        assert a_id != b_id
834        r = False
835       
836        while a.is_active() and b.is_active() and a_id != b_id:
837            if a_id < b_id:
838                ra = a.skip_to(b_id)
839                if not a.is_active(): return
840                r = r or ra
841                a_id = a.id()
842            else:
843                rb = b.skip_to(a_id)
844                if not b.is_active(): return
845                r = r or rb
846                b_id = b.id()
847        return r
848   
849    def id(self):
850        return self.a.id()
851   
852    # Using sets is faster in some cases, but could potentially use a lot of
853    # memory
854    def all_ids(self):
855        return iter(sorted(set(self.a.all_ids()) & set(self.b.all_ids())))
856   
857    def skip_to(self, id):
858        if not self.is_active(): raise ReadTooFar
859        ra = self.a.skip_to(id)
860        rb = self.b.skip_to(id)
861        if self.is_active():
862            rn = False
863            if self.a.id() != self.b.id():
864                rn = self._find_next()
865            return ra or rb or rn
866   
867    def skip_to_quality(self, minquality):
868        a = self.a
869        b = self.b
870        minquality = minquality
871       
872        skipped = 0
873        aq = a.block_quality()
874        bq = b.block_quality()
875        while a.is_active() and b.is_active() and aq + bq <= minquality:
876            if aq < bq:
877                skipped += a.skip_to_quality(minquality - bq)
878            else:
879                skipped += b.skip_to_quality(minquality - aq)
880            if a.id() != b.id():
881                self._find_next()
882            aq = a.block_quality()
883            bq = b.block_quality()
884        return skipped
885   
886    def next(self):
887        if not self.is_active(): raise ReadTooFar
888       
889        # We must assume that the ids are equal whenever next() is called (they
890        # should have been made equal by _find_next), so advance them both
891        ar = self.a.next()
892        if self.is_active():
893            nr = self._find_next()
894            return ar or nr
895   
896    def spans(self):
897        return sorted(set(self.a.spans()) | set(self.b.spans()))
898   
899
900class AndNotMatcher(BiMatcher):
901    """Matches the postings in the first sub-matcher that are NOT present in
902    the second sub-matcher.
903    """
904
905    def __init__(self, a, b):
906        super(AndNotMatcher, self).__init__(a, b)
907        if (self.a.is_active()
908            and self.b.is_active()
909            and self.a.id() != self.b.id()):
910            self._find_next()
911
912    def is_active(self):
913        return self.a.is_active()
914
915    def _find_next(self):
916        pos = self.a
917        neg = self.b
918        if not neg.is_active(): return
919        pos_id = pos.id()
920        r = False
921       
922        if neg.id() < pos_id:
923            neg.skip_to(pos_id)
924       
925        while neg.is_active() and pos_id == neg.id():
926            nr = pos.next()
927            r = r or nr
928            pos_id = pos.id()
929            neg.skip_to(pos_id)
930       
931        return r
932   
933    def replace(self):
934        if not self.a.is_active(): return NullMatcher()
935        if not self.b.is_active(): return self.a
936        return self
937   
938    def quality(self):
939        return self.a.quality()
940   
941    def block_quality(self):
942        return self.a.block_quality()
943   
944    def skip_to_quality(self, minquality):
945        skipped = self.a.skip_to_quality(minquality)
946        self._find_next()
947        return skipped
948   
949    def id(self):
950        return self.a.id()
951   
952    def all_ids(self):
953        return iter(sorted(set(self.a.all_ids()) - set(self.b.all_ids())))
954   
955    def next(self):
956        if not self.a.is_active(): raise ReadTooFar
957        ar = self.a.next()
958        nr = False
959        if self.b.is_active():
960            nr = self._find_next()
961        return ar or nr
962       
963    def skip_to(self, id):
964        if not self.a.is_active(): raise ReadTooFar
965        if id < self.a.id(): return
966       
967        self.a.skip_to(id)
968        if self.b.is_active():
969            self.b.skip_to(id)
970            self._find_next()
971   
972    def weight(self):
973        return self.a.weight()
974   
975    def score(self):
976        return self.a.score()
977   
978    def supports(self, astype):
979        return self.a.supports(astype)
980   
981    def value(self):
982        return self.a.value()
983   
984    def value_as(self, astype):
985        return self.a.value_as(astype)
986   
987
988class InverseMatcher(WrappingMatcher):
989    """Synthetic matcher, generates postings that are NOT present in the
990    wrapped matcher.
991    """
992   
993    def __init__(self, child, limit, missing=None, weight=1.0):
994        super(InverseMatcher, self).__init__(child)
995        self.limit = limit
996        self._weight = weight
997        self.missing = missing or (lambda id: False)
998        self._id = 0
999        self._find_next()
1000   
1001    def copy(self):
1002        return self.__class__(self.child.copy(), self.limit,
1003                              weight=self._weight, missing=self.missing)
1004   
1005    def _replacement(self, newchild):
1006        return self.__class__(newchild, self.limit, missing=self.missing,
1007                              weight=self.weight)
1008   
1009    def is_active(self):
1010        return self._id < self.limit
1011   
1012    def supports_quality(self):
1013        return False
1014   
1015    def _find_next(self):
1016        child = self.child
1017        missing = self.missing
1018       
1019        if not child.is_active() and not missing(self._id):
1020            return
1021       
1022        if child.is_active() and child.id() < self._id:
1023            child.skip_to(self._id)
1024       
1025        while (self._id < self.limit
1026               and ((child.is_active() and self._id == child.id())
1027                    or missing(self._id))):
1028            self._id += 1
1029            if child.is_active():
1030                child.next()
1031   
1032    def id(self):
1033        return self._id
1034   
1035    def all_ids(self):
1036        missing = self.missing
1037        negs = set(self.child.all_ids())
1038        return (id for id in xrange(self.limit)
1039                if id not in negs and not missing(id))
1040   
1041    def next(self):
1042        if self._id >= self.limit: raise ReadTooFar
1043        self._id += 1
1044        self._find_next()
1045       
1046    def skip_to(self, id):
1047        if self._id >= self.limit: raise ReadTooFar
1048        if id < self._id: return
1049        self._id = id
1050        self._find_next()
1051   
1052    def weight(self):
1053        return self._weight
1054   
1055    def score(self):
1056        return self._weight
1057
1058
1059class RequireMatcher(WrappingMatcher):
1060    """Matches postings that are in both sub-matchers, but only uses scores
1061    from the first.
1062    """
1063   
1064    def __init__(self, a, b):
1065        self.a = a
1066        self.b = b
1067        self.child = IntersectionMatcher(a, b)
1068   
1069    def copy(self):
1070        return self.__class__(self.a.copy(), self.b.copy())
1071   
1072    def replace(self):
1073        if not self.child.is_active(): return NullMatcher()
1074        return self
1075   
1076    def quality(self):
1077        return self.a.quality()
1078   
1079    def block_quality(self):
1080        return self.a.block_quality()
1081   
1082    def skip_to_quality(self, minquality):
1083        skipped = self.a.skip_to_quality(minquality)
1084        self.child._find_next()
1085        return skipped
1086   
1087    def weight(self):
1088        return self.a.weight()
1089   
1090    def score(self):
1091        return self.a.score()
1092   
1093    def supports(self, astype):
1094        return self.a.supports(astype)
1095   
1096    def value(self):
1097        return self.a.value()
1098       
1099    def value_as(self, astype):
1100        return self.a.value_as(astype)
1101   
1102
1103class AndMaybeMatcher(AdditiveBiMatcher):
1104    """Matches postings in the first sub-matcher, and if the same posting is
1105    in the second sub-matcher, adds their scores.
1106    """
1107   
1108    def __init__(self, a, b):
1109        self.a = a
1110        self.b = b
1111       
1112        if a.is_active() and b.is_active() and a.id() != b.id():
1113            b.skip_to(a.id())
1114   
1115    def is_active(self):
1116        return self.a.is_active()
1117   
1118    def id(self):
1119        return self.a.id()
1120   
1121    def next(self):
1122        if not self.a.is_active(): raise ReadTooFar
1123       
1124        ar = self.a.next()
1125        br = False
1126        if self.a.is_active() and self.b.is_active():
1127            br = self.b.skip_to(self.a.id())
1128        return ar or br
1129   
1130    def skip_to(self, id):
1131        if not self.a.is_active(): raise ReadTooFar
1132       
1133        ra = self.a.skip_to(id)
1134        rb = False
1135        if self.a.is_active() and self.b.is_active():
1136            rb = self.b.skip_to(id)
1137        return ra or rb
1138   
1139    def replace(self):
1140        ar = self.a.replace()
1141        br = self.b.replace()
1142        if not ar.is_active(): return NullMatcher()
1143        if not br.is_active(): return ar
1144        if ar is not self.a or br is not self.b:
1145            return self.__class__(ar, br)
1146        return self
1147   
1148    def skip_to_quality(self, minquality):
1149        a = self.a
1150        b = self.b
1151        minquality = minquality
1152       
1153        if not a.is_active(): raise ReadTooFar
1154        if not b.is_active():
1155            return a.skip_to_quality(minquality)
1156       
1157        skipped = 0
1158        aq = a.block_quality()
1159        bq = b.block_quality()
1160        while a.is_active() and b.is_active() and aq + bq <= minquality:
1161            if aq < bq:
1162                skipped += a.skip_to_quality(minquality - bq)
1163                aq = a.block_quality()
1164            else:
1165                skipped += b.skip_to_quality(minquality - aq)
1166                bq = b.block_quality()
1167       
1168        return skipped
1169   
1170    def weight(self):
1171        if self.a.id() == self.b.id():
1172            return self.a.weight() + self.b.weight()
1173        else:
1174            return self.a.weight()
1175   
1176    def score(self):
1177        if self.b.is_active() and self.a.id() == self.b.id():
1178            return self.a.score() + self.b.score()
1179        else:
1180            return self.a.score()
1181       
1182    def supports(self, astype):
1183        return self.a.supports(astype)
1184   
1185    def value(self):
1186        return self.a.value()
1187       
1188    def value_as(self, astype):
1189        return self.a.value_as(astype)
1190
1191
1192class ConstantScoreMatcher(WrappingMatcher):
1193    def __init__(self, child, score=1.0):
1194        super(ConstantScoreMatcher, self).__init__(child)
1195        self._score = score
1196   
1197    def copy(self):
1198        return self.__class__(self.child.copy(), score=self._score)
1199   
1200    def _replacement(self, newchild):
1201        return self.__class__(newchild, score=self._score)
1202   
1203    def quality(self):
1204        return self._score
1205   
1206    def block_quality(self):
1207        return self._score
1208   
1209    def score(self):
1210        return self._score
1211   
1212
1213
1214#class PhraseMatcher(WrappingMatcher):
1215#    """Matches postings where a list of sub-matchers occur next to each other
1216#    in order.
1217#    """
1218#   
1219#    def __init__(self, wordmatchers, slop=1, boost=1.0):
1220#        self.wordmatchers = wordmatchers
1221#        self.child = make_binary_tree(IntersectionMatcher, wordmatchers)
1222#        self.slop = slop
1223#        self.boost = boost
1224#        self._spans = None
1225#        self._find_next()
1226#   
1227#    def copy(self):
1228#        return self.__class__(self.wordmatchers[:], slop=self.slop, boost=self.boost)
1229#   
1230#    def replace(self):
1231#        if not self.is_active():
1232#            return NullMatcher()
1233#        return self
1234#   
1235#    def all_ids(self):
1236#        # Need to redefine this because the WrappingMatcher parent class
1237#        # forwards to the submatcher, which in this case is just the
1238#        # IntersectionMatcher.
1239#        while self.is_active():
1240#            yield self.id()
1241#            self.next()
1242#   
1243#    def next(self):
1244#        ri = self.child.next()
1245#        rn = self._find_next()
1246#        return ri or rn
1247#   
1248#    def skip_to(self, id):
1249#        rs = self.child.skip_to(id)
1250#        rn = self._find_next()
1251#        return rs or rn
1252#   
1253#    def skip_to_quality(self, minquality):
1254#        skipped = 0
1255#        while self.is_active() and self.quality() <= minquality:
1256#            # TODO: doesn't count the documents matching the phrase yet
1257#            skipped += self.child.skip_to_quality(minquality/self.boost)
1258#            self._find_next()
1259#        return skipped
1260#   
1261#    def positions(self):
1262#        if not self.is_active():
1263#            raise ReadTooFar
1264#        if not self.wordmatchers:
1265#            return []
1266#        return self.wordmatchers[0].positions()
1267#   
1268#    def _find_next(self):
1269#        isect = self.child
1270#        slop = self.slop
1271#       
1272#        # List of "active" positions
1273#        current = []
1274#       
1275#        while not current and isect.is_active():
1276#            # [[list of positions for word 1],
1277#            #  [list of positions for word 2], ...]
1278#            poses = [m.positions() for m in self.wordmatchers]
1279#           
1280#            # Set the "active" position list to the list of positions of the
1281#            # first word. We well then iteratively update this list with the
1282#            # positions of subsequent words if they are within the "slop"
1283#            # distance of the positions in the active list.
1284#            current = poses[0]
1285#           
1286#            # For each list of positions for the subsequent words...
1287#            for poslist in poses[1:]:
1288#                # A list to hold the new list of active positions
1289#                newposes = []
1290#               
1291#                # For each position in the list of positions in this next word
1292#                for newpos in poslist:
1293#                    # Use bisect to only check the part of the current list
1294#                    # that could contain positions within the "slop" distance
1295#                    # of the new position
1296#                    start = bisect_left(current, newpos - slop)
1297#                    end = bisect_right(current, newpos)
1298#                   
1299#                    #
1300#                    for curpos in current[start:end]:
1301#                        delta = newpos - curpos
1302#                        if delta > 0 and delta <= slop:
1303#                            newposes.append(newpos)
1304#                   
1305#                current = newposes
1306#                if not current: break
1307#           
1308#            if not current:
1309#                isect.next()
1310#       
1311#        self._count = len(current)
1312#
1313#
1314#class VectorPhraseMatcher(BasePhraseMatcher):
1315#    """Phrase matcher for fields with a vector with positions (i.e. Positions
1316#    or CharacterPositions format).
1317#    """
1318#   
1319#    def __init__(self, searcher, fieldname, words, isect, slop=1, boost=1.0):
1320#        """
1321#        :param searcher: a Searcher object.
1322#        :param fieldname: the field in which to search.
1323#        :param words: a sequence of token texts representing the words in the
1324#            phrase.
1325#        :param isect: an intersection matcher for the words in the phrase.
1326#        :param slop:
1327#        """
1328#       
1329#        decodefn = searcher.field(fieldname).vector.decoder("positions")
1330#        self.reader = searcher.reader()
1331#        self.fieldname = fieldname
1332#        self.words = words
1333#        self.sortedwords = sorted(self.words)
1334#        super(VectorPhraseMatcher, self).__init__(isect, decodefn, slop=slop,
1335#                                                  boost=boost)
1336#   
1337#    def _poses(self):
1338#        vreader = self.reader.vector(self.child.id(), self.fieldname)
1339#        poses = {}
1340#        decode_positions = self.decode_positions
1341#        for word in self.sortedwords:
1342#            vreader.skip_to(word)
1343#            if vreader.id() != word:
1344#                raise Exception("Phrase query: %r in term index but not in"
1345#                                " vector (possible analyzer mismatch)" % word)
1346#            poses[word] = decode_positions(vreader.value())
1347#        # Now put the position lists in phrase order
1348#        return [poses[word] for word in self.words]
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
Note: See TracBrowser for help on using the browser.