Changeset 234


Timestamp: Jan 9, 2011 7:37:51 PM
Author: mbutscher
Message:

branches/stable-2.0:

  • Bug fixed: Windows: "Invalid handle" error in the command-line bridge insertion plugins

branches/mbutscher/work:

  • Record the program version that last wrote to the wiki database
  • Several bug fixes in Whoosh (index search)
  • Index search: highlight found terms and jump to a found term on double-click
  • Bug fixed: Windows: "Invalid handle" error in the command-line bridge insertion plugins
Location: branches
Files: 4 added, 33 edited

  • branches/mbutscher/work/extensions/GnuplotClBridge.py

    r231 r234  
    117117#             childIn, childOut, childErr = os.popen3(cmdline, "b")
    118118            popenObject = subprocess.Popen(cmdline, shell=True,
    119                     stderr=subprocess.PIPE)
     119                    stderr=subprocess.PIPE, stdout=subprocess.PIPE,
     120                    stdin=subprocess.PIPE)
    120121            childErr = popenObject.stderr
    121122           
     123            # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     124            # why this is necessary
     125            popenObject.stdin.close()
     126            popenObject.stdout.close()
     127
    122128            if u"noerror" in [a.strip() for a in insToken.appendices]:
    123129                childErr.read()
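
    The same fix recurs in each of the command-line bridge plugins below: request all three standard pipes from subprocess.Popen and close the unused ones immediately, which avoids the Windows "Invalid handle" error described in the linked thread. A minimal standalone sketch of the pattern, with a placeholder command line:

        import subprocess

        cmdline = "gnuplot plot.gpl"   # placeholder command

        # Redirecting only stderr while shell=True can fail with "invalid
        # handle" on Windows, so all three standard streams are redirected ...
        popenObject = subprocess.Popen(cmdline, shell=True,
                stderr=subprocess.PIPE, stdout=subprocess.PIPE,
                stdin=subprocess.PIPE)

        # ... and the pipes that are not read are closed right away; see
        # http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
        popenObject.stdin.close()
        popenObject.stdout.close()

        errorText = popenObject.stderr.read()   # only stderr is consumed here
        popenObject.stderr.close()
        popenObject.wait()
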
  • branches/mbutscher/work/extensions/GraphvizClBridge.py

    r231 r234  
    124124#             childIn, childOut, childErr = os.popen3(cmdline, "b")
    125125            popenObject = subprocess.Popen(cmdline, shell=True,
    126                     stderr=subprocess.PIPE)
     126                    stderr=subprocess.PIPE, stdout=subprocess.PIPE,
     127                    stdin=subprocess.PIPE)
    127128            childErr = popenObject.stderr
     129
     130            # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     131            # why this is necessary
     132            popenObject.stdin.close()
     133            popenObject.stdout.close()
    128134
    129135            if u"noerror" in [a.strip() for a in insToken.appendices]:
  • branches/mbutscher/work/extensions/GraphvizStructureView.py

    r231 r234  
    342342#             childIn, childOut, childErr = os.popen3(cmdline, "b")
    343343            popenObject = subprocess.Popen(cmdline, shell=True,
    344                     stderr=subprocess.PIPE)
     344                    stderr=subprocess.PIPE, stdout=subprocess.PIPE,
     345                    stdin=subprocess.PIPE)
    345346            childErr = popenObject.stderr
     347
     348            # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     349            # why this is necessary
     350            popenObject.stdin.close()
     351            popenObject.stdout.close()
    346352
    347353            if u"noerror" in [a.strip() for a in insParams]:
  • branches/mbutscher/work/extensions/MimeTexCGIBridge.py

    r231 r234  
    103103        # Run MimeTeX process
    104104        popenObject = subprocess.Popen(cmdline, shell=True,
    105                 stdout=subprocess.PIPE)
     105                 stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     106                 stderr=subprocess.PIPE)
     107
    106108        childOut = popenObject.stdout
     109       
     110        # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     111        # why this is necessary
     112        popenObject.stdin.close()
     113        popenObject.stderr.close()
    107114
    108115        # Read stdout of process entirely
  • branches/mbutscher/work/extensions/PloticusClBridge.py

    r231 r234  
    128128#             childIn, childOut, childErr = os.popen3(cmdline, "b")
    129129            popenObject = subprocess.Popen(cmdline, shell=True,
    130                     stderr=subprocess.PIPE)
     130                    stderr=subprocess.PIPE, stdout=subprocess.PIPE,
     131                    stdin=subprocess.PIPE)
    131132            childErr = popenObject.stderr
    132            
     133
     134            # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     135            # why this is necessary
     136            popenObject.stdin.close()
     137            popenObject.stdout.close()
     138
    133139            if u"noerror" in [a.strip() for a in insToken.appendices]:
    134140                childErr.read()
  • branches/mbutscher/work/lib/pwiki/DocPages.py

    r230 r234  
    11from __future__ import with_statement
    2 
    3 # import profilehooks
    4 # profile = profilehooks.profile(filename="profile.prf", immediate=False)
     2## import profilehooks
     3## profile = profilehooks.profile(filename="profile.prf", immediate=False)
    54
    65
     
    2726
    2827import Serialization
    29 # from Serialization import SerializeStream
    3028
    3129
     
    17361734        return valid
    17371735
    1738    
     1736
    17391737    def putIntoSearchIndex(self, threadstop=DUMBTHREADSTOP):
    17401738        """
    1741         Add or update the reverse index for the given docPage
     1739        Add or update the index for the given docPage
    17421740        """
    17431741        with self.textOperationLock:
  • branches/mbutscher/work/lib/pwiki/PersonalWikiFrame.py

    r231 r234  
    22from __future__ import with_statement
    33
    4 ## import hotshot
    5 ## _prof = hotshot.Profile("hotshot.prf")
    64## import profilehooks
    75## profile = profilehooks.profile(filename="profile.prf", immediate=False)
  • branches/mbutscher/work/lib/pwiki/SearchAndReplace.py

    r230 r234  
    16971697        """
    16981698        if self.searchStr == u"":
     1699            self.searchOpTree = AllWikiPagesNode(self)
     1700            return
     1701       
     1702        if self.indexSearch != "no":
     1703            # Search tree not used, but non-None value needed
    16991704            self.searchOpTree = AllWikiPagesNode(self)
    17001705            return
     
    18081813
    18091814
     1815    def getWhooshIndexQuery(self, wikiDocument):
     1816        from whoosh.qparser import QueryParser
     1817
     1818        qp = QueryParser("content", schema=wikiDocument.getWhooshIndexSchema())
     1819        q = qp.parse(self.searchStr)
     1820#         print "--getWhooshIndexQuery10", repr((qp, q))
     1821
     1822        return q
     1823
     1824
     1825    def hasWhooshHighlighting(self):
     1826        """
     1827        Return True iff call to highlightWhooshIndexFound() would work.
     1828        """
     1829        return self.indexSearch == "default"
     1830
     1831
     1832    def highlightWhooshIndexFound(self, content, docPage, before, after,
     1833            formatter=None):
     1834        """
     1835        Retrieve formatted output with highlighted search hits for a page.
     1836        formatter -- whoosh formatter or None (uses SimpleHtmlFormatter then)
     1837        """
     1838        if docPage is None:
     1839            return
     1840       
     1841        from whoosh import highlight
     1842
     1843        # TODO: Loop invariant, move out?
     1844        q = self.getWhooshIndexQuery(docPage.getWikiDocument())
     1845       
     1846        # Extract the terms the user mentioned
     1847        terms = [text for fieldname, text in q.all_terms()
     1848                if fieldname == "content"]
     1849       
     1850        analyzer = docPage.getWikiDocument().getWhooshIndexContentAnalyzer()
     1851       
     1852        # TODO: Length of before and after from config
     1853        fragmenter = highlight.ContextFragmenter(terms, (before + after) * 2,
     1854                before, after)
     1855
     1856        if formatter is None:
     1857            formatter = highlight.SimpleHtmlFormatter()
     1858       
     1859        return highlight.highlight(content, terms, analyzer,
     1860                     fragmenter, formatter, top=1)
     1861
     1862
    18101863    def hasParticularTextPosition(self):
    18111864        if self.indexSearch != "no":
     
    18551908        if self.searchOpTree is None:
    18561909            self.rebuildSearchOpTree()
    1857        
     1910
    18581911        if commonCache is None:
    18591912            commonCache = {}
    1860            
     1913
    18611914        self.listWikiPagesOp.beginWikiSearch(wikiDocument,
    18621915                commonCache=commonCache)
    1863            
     1916
    18641917        return self.searchOpTree.beginWikiSearch(wikiDocument,
    18651918                commonCache=commonCache)
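
    The new getWhooshIndexQuery(), hasWhooshHighlighting() and highlightWhooshIndexFound() methods wrap Whoosh's query parser and highlight module so callers no longer touch Whoosh directly. A rough usage sketch, assuming sarOp is a SearchReplaceOperation configured for an index search and docPage is a loaded wiki page (the 30-character context lengths are placeholders for the configured before/after values):

        if sarOp.hasWhooshHighlighting():
            text = docPage.getLiveTextNoTemplate()
            if text is not None:
                # HTML with hits wrapped in <b></b>, plus the character
                # position of the first hit (SimpleHtmlFormatter is used
                # when no formatter is passed in)
                html, firstPos = sarOp.highlightWhooshIndexFound(
                        text, docPage, 30, 30)
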
  • branches/mbutscher/work/lib/pwiki/SearchAndReplaceDialogs.py

    r230 r234  
    1 ## import hotshot
    2 ## _prof = hotshot.Profile("hotshot.prf")
     1# import profilehooks
     2# profile = profilehooks.profile(filename="profile.prf", immediate=False)
    33
    44import sys, traceback, re, threading, time
     
    8080        self.occNumber = occNumber
    8181        return self
     82
     83
     84    def setHtmlDirectly(self, occHtml):
     85        self.occNumber = -1
     86        self.occCount = -1
     87        self.occHtml = occHtml
     88
    8289
    8390
     
    220227                context = before + after
    221228
    222                 if not sarOp.hasParticularTextPosition():
    223                     # No specific position to show as context, so show beginning of page
    224                     # Also, no occurrence counting possible
    225                     if context == 0:
     229                if sarOp.hasParticularTextPosition():
     230                    if context == 0 and not countOccurrences:
     231                        # No context, no occurrence counting
     232                        # -> just a list of found pages
    226233                        self.foundinfo = [_SearchResultItemInfo(w) for w in found]
    227234                    else:
    228                         for w in found:
    229                             text = wikiDocument.getWikiPageNoError(w).\
    230                                     getLiveTextNoTemplate()
    231                             if text is None:
    232                                 continue
    233                             self.foundinfo.append(
    234                                     _SearchResultItemInfo(w).buildOccurrence(
    235                                     text, before, after, (-1, -1), -1, 100))
    236                     threadstop.testRunning()
    237                 else:
    238                     if context == 0 and not countOccurrences:
    239                         # No context, no occurrence counting
    240                         self.foundinfo = [_SearchResultItemInfo(w) for w in found]
    241                     else:
     235                        # "As is" or regex search
    242236                        sarOp.beginWikiSearch(self.pWiki.getWikiDocument())
    243237                        try:
     
    270264                                info = _SearchResultItemInfo(w, occPos=pos,
    271265                                        maxOccCount=maxCountOccurrences)
    272 
     266   
    273267                                if countOccurrences:
    274268                                    occ = 1
     
    290284                        finally:
    291285                            sarOp.endWikiSearch()
    292 
     286                elif sarOp.hasWhooshHighlighting():
     287                    # Index search
     288                    if context == 0:
     289                        # No context, occurrence counting doesn't matter
     290                        # -> just a list of found pages
     291                        self.foundinfo = [_SearchResultItemInfo(w) for w in found]
     292                    else:
     293                        sarOp.beginWikiSearch(self.pWiki.getWikiDocument())
     294                        try:
     295                            for w in found:
     296                                threadstop.testRunning()
     297                                docPage = wikiDocument.getWikiPageNoError(w)
     298                                text = docPage.getLiveTextNoTemplate()
     299                                if text is None:
     300                                    continue
     301   
     302                                html, firstPos = sarOp.highlightWhooshIndexFound(
     303                                        text, docPage, before, after)
     304                               
     305                                info = _SearchResultItemInfo(w, occPos=(firstPos, firstPos))
     306                                info.setHtmlDirectly(html)
     307   
     308                                self.foundinfo.append(info)
     309                        finally:
     310                            sarOp.endWikiSearch()
     311                else:  # not sarOp.hasParticularTextPosition():
     312                    # No specific position to show as context, so show beginning of page
     313                    # Also, no occurrence counting possible
     314                    if context == 0:
     315                        self.foundinfo = [_SearchResultItemInfo(w) for w in found]
     316                    else:
     317                        for w in found:
     318                            text = wikiDocument.getWikiPageNoError(w).\
     319                                    getLiveTextNoTemplate()
     320                            if text is None:
     321                                continue
     322                            self.foundinfo.append(
     323                                    _SearchResultItemInfo(w).buildOccurrence(
     324                                    text, before, after, (-1, -1), -1, 100))
     325                    threadstop.testRunning()
     326               
    293327                threadstop.testRunning()
    294328                self.isShowingSearching = False
     
    10931127
    10941128
     1129#     @profile
    10951130    def _refreshPageList(self):
    10961131        sarOp = self._buildSearchReplaceOperation()
  • branches/mbutscher/work/lib/pwiki/wikidata/WikiDataManager.py

    r231 r234  
    11from __future__ import with_statement
     2
    23
    34from weakref import WeakValueDictionary
     
    12911292           
    12921293            if self.isSearchIndexEnabled():
    1293                 # Step four: update reverse index
     1294                # Step four: update index
    12941295                for wikiWord in wikiWords:
    12951296                    progresshandler.update(step, _(u"Update index of %s") % wikiWord)
     
    15961597            return result
    15971598        else:
    1598             # Processing reverse index search
    1599             from whoosh.qparser import QueryParser
    1600            
     1599            # Processing index search
    16011600            threadstop.testRunning()
    16021601            if not self.isSearchIndexEnabled():
    16031602                return []
    16041603
    1605             qp = QueryParser("content", schema=self._getSearchIndexSchema())
    1606             q = qp.parse(sarOp.searchStr)
     1604            q = sarOp.getWhooshIndexQuery(self)
    16071605            s = self.getSearchIndex().searcher()
    16081606            threadstop.testRunning()
    1609             resultList = s.search(q)
     1607            resultList = s.search(q, limit=None)
     1608           
     1609#             docnumList = [(rd.docnum, rd["unifName"]) for rd in resultList]
     1610#             
     1611#             docnumList.sort()
     1612#             docpp = "\n".join(["%3i %s" % rd for rd in docnumList])
     1613#
     1614#             print "--docResults"
     1615#             print docpp.encode("mbcs", "replace")
     1616
    16101617            result = [rd["unifName"][9:] for rd in resultList
    16111618                    if rd["unifName"].startswith(u"wikipage/")]
     
    16141621            return result
    16151622
    1616    
     1623
     1624    @staticmethod
     1625    def getWhooshIndexContentAnalyzer():
     1626        from whoosh.analysis import StandardAnalyzer       
     1627        return StandardAnalyzer(stoplist=None)
     1628
     1629
     1630
    16171631    _REV_SEARCH_INDEX_SCHEMA = None
    16181632   
    16191633    @staticmethod
    1620     def _getSearchIndexSchema():
     1634    def getWhooshIndexSchema():
    16211635        if WikiDataManager._REV_SEARCH_INDEX_SCHEMA is None:
    16221636            from whoosh.fields import Schema, ID, NUMERIC, TEXT
    1623             from whoosh.analysis import StandardAnalyzer
    16241637           
    16251638            WikiDataManager._REV_SEARCH_INDEX_SCHEMA = Schema(
    16261639                    unifName=ID(stored=True, unique=True),
    16271640                    modTimestamp=NUMERIC(), content=TEXT(
    1628                     analyzer=StandardAnalyzer(stoplist=None)))
     1641                    analyzer=WikiDataManager.getWhooshIndexContentAnalyzer()))
    16291642
    16301643        return WikiDataManager._REV_SEARCH_INDEX_SCHEMA
     1644   
     1645   
    16311646
    16321647
     
    16871702
    16881703            if clear or not whoosh.index.exists_in(indexPath):
    1689                 schema = self._getSearchIndexSchema()
     1704                schema = self.getWhooshIndexSchema()
    16901705                whoosh.index.create_in(indexPath, schema)
    16911706
     
    17411756
    17421757
    1743 
    1744     def putIntoSearchIndex(self, wikiPage):
    1745         """
    1746         Add or update the reverse index for the given docPage
    1747         """
    1748         if not self.isSearchIndexEnabled():
    1749             return
    1750 
    1751         if isinstance(wikiPage, AliasWikiPage):
    1752             wikiPage = WikiPage(self, wikiWord)
    1753 
    1754         content = wikiPage.getLiveText()
    1755 
    1756         try:
    1757             searchIdx = self.getSearchIndex()
    1758             writer = searchIdx.writer()
    1759            
    1760             unifName = wikiPage.getUnifiedPageName()
    1761            
    1762             writer.delete_by_term("unifName", unifName)
    1763             writer.add_document(unifName=unifName,
    1764                     modTimestamp=wikiPage.getTimestamps()[0],
    1765                     content=content)
    1766         except:
    1767             writer.cancel()
    1768             raise
    1769 
    1770         writer.commit()
    1771        
    1772    
    17731758    def removeFromSearchIndex(self, unifName):
    17741759        if not self.isSearchIndexEnabled():
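
    The index-search path now obtains its query from SearchAndReplaceOperation.getWhooshIndexQuery() and searches without a result limit, while the schema and content analyzer are exposed through getWhooshIndexSchema() and getWhooshIndexContentAnalyzer(). A standalone sketch of the equivalent raw Whoosh calls, independent of WikidPad (the index directory and document contents are made up):

        import os
        import whoosh.index
        from whoosh.fields import Schema, ID, NUMERIC, TEXT
        from whoosh.analysis import StandardAnalyzer
        from whoosh.qparser import QueryParser

        schema = Schema(unifName=ID(stored=True, unique=True),
                modTimestamp=NUMERIC(),
                content=TEXT(analyzer=StandardAnalyzer(stoplist=None)))

        indexPath = "searchindex"   # made-up location
        if not os.path.exists(indexPath):
            os.mkdir(indexPath)
        if not whoosh.index.exists_in(indexPath):
            ix = whoosh.index.create_in(indexPath, schema)
        else:
            ix = whoosh.index.open_dir(indexPath)

        writer = ix.writer()
        writer.delete_by_term("unifName", u"wikipage/TestPage")
        writer.add_document(unifName=u"wikipage/TestPage",
                modTimestamp=0, content=u"some page text to find")
        writer.commit()

        q = QueryParser("content", schema=schema).parse(u"find")
        s = ix.searcher()
        # limit=None returns all matches instead of Whoosh's default top-N cut-off
        resultList = s.search(q, limit=None)
        result = [rd["unifName"][9:] for rd in resultList
                if rd["unifName"].startswith(u"wikipage/")]
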
  • branches/mbutscher/work/lib/pwiki/wikidata/compact_sqlite/DbStructure.py

    r216 r234  
    1111from os.path import exists, join
    1212
     13import Consts
    1314from pwiki.WikiExceptions import *
    1415from pwiki.StringOps import mbcsDec, mbcsEnc, utf8Enc, utf8Dec, applyBinCompact, \
     
    12841285        connwrap.execSql("insert or replace into settings(key, value) "
    12851286                "values ('lastwritever', '"+str(VERSION_DB)+"')")
     1287               
     1288        # Write which program version at last wrote to database
     1289        connwrap.execSql("insert or replace into settings(key, value) "
     1290                "values ('lastwriteprogver.branchtag', '"+Consts.VERSION_TUPLE[0]+"')")
     1291        connwrap.execSql("insert or replace into settings(key, value) "
     1292                "values ('lastwriteprogver.major', '"+str(Consts.VERSION_TUPLE[1])+"')")
     1293        connwrap.execSql("insert or replace into settings(key, value) "
     1294                "values ('lastwriteprogver.minor', '"+str(Consts.VERSION_TUPLE[2])+"')")
     1295        connwrap.execSql("insert or replace into settings(key, value) "
     1296                "values ('lastwriteprogver.sub', '"+str(Consts.VERSION_TUPLE[3])+"')")
     1297        connwrap.execSql("insert or replace into settings(key, value) "
     1298                "values ('lastwriteprogver.patch', '"+str(Consts.VERSION_TUPLE[4])+"')")
    12861299    except sqlite.ReadOnlyDbError:
    12871300        pass
  • branches/mbutscher/work/lib/pwiki/wikidata/original_gadfly/DbStructure.py

    r214 r234  
    1212import glob
    1313
     14import Consts
    1415from pwiki.WikiExceptions import *
    1516from pwiki.StringOps import mbcsDec, mbcsEnc, utf8Enc, utf8Dec, \
     
    11301131    try:
    11311132        setSettingsValue(connwrap, "lastwritever", str(VERSION_DB))
     1133       
     1134        # Write which program version at last wrote to database
     1135        setSettingsValue(connwrap, "lastwriteprogver.branchtag", Consts.VERSION_TUPLE[0])
     1136        setSettingsValue(connwrap, "lastwriteprogver.major", str(Consts.VERSION_TUPLE[1]))
     1137        setSettingsValue(connwrap, "lastwriteprogver.minor", str(Consts.VERSION_TUPLE[2]))
     1138        setSettingsValue(connwrap, "lastwriteprogver.sub", str(Consts.VERSION_TUPLE[3]))
     1139        setSettingsValue(connwrap, "lastwriteprogver.patch", str(Consts.VERSION_TUPLE[4]))
    11321140    except IOError:
    11331141        pass
  • branches/mbutscher/work/lib/pwiki/wikidata/original_sqlite/DbStructure.py

    r217 r234  
    1111from os.path import exists, join
    1212
     13import Consts
    1314from pwiki.WikiExceptions import *
    1415from pwiki.StringOps import mbcsDec, mbcsEnc, utf8Enc, utf8Dec, applyBinCompact, \
     
    995996    """
    996997    try:
    997         # Write which version at last wrote to database
     998        # Write which format version at last wrote to database
    998999        connwrap.execSql("insert or replace into settings(key, value) "
    9991000                "values ('lastwritever', '"+str(VERSION_DB)+"')")
     1001
     1002        # Write which program version at last wrote to database
     1003        connwrap.execSql("insert or replace into settings(key, value) "
     1004                "values ('lastwriteprogver.branchtag', '"+Consts.VERSION_TUPLE[0]+"')")
     1005        connwrap.execSql("insert or replace into settings(key, value) "
     1006                "values ('lastwriteprogver.major', '"+str(Consts.VERSION_TUPLE[1])+"')")
     1007        connwrap.execSql("insert or replace into settings(key, value) "
     1008                "values ('lastwriteprogver.minor', '"+str(Consts.VERSION_TUPLE[2])+"')")
     1009        connwrap.execSql("insert or replace into settings(key, value) "
     1010                "values ('lastwriteprogver.sub', '"+str(Consts.VERSION_TUPLE[3])+"')")
     1011        connwrap.execSql("insert or replace into settings(key, value) "
     1012                "values ('lastwriteprogver.patch', '"+str(Consts.VERSION_TUPLE[4])+"')")
    10001013    except sqlite.ReadOnlyDbError:
    10011014        pass
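
    All three database back-ends now also record which program version last wrote to the database, splitting Consts.VERSION_TUPLE across separate lastwriteprogver.* settings keys. A small sketch of the resulting rows using plain sqlite3 instead of WikidPad's connection wrapper (the simple key/value settings table and the version tuple value are assumptions for illustration):

        import sqlite3

        VERSION_TUPLE = ("wikidPad", 2, 1, 0, 1)   # assumed example value

        conn = sqlite3.connect(":memory:")
        conn.execute("create table settings(key text primary key, value text)")

        conn.execute("insert or replace into settings(key, value) values (?, ?)",
                ("lastwriteprogver.branchtag", VERSION_TUPLE[0]))
        for name, idx in (("major", 1), ("minor", 2), ("sub", 3), ("patch", 4)):
            conn.execute("insert or replace into settings(key, value) values (?, ?)",
                    ("lastwriteprogver." + name, str(VERSION_TUPLE[idx])))

        print(dict(conn.execute("select key, value from settings")))
        # e.g. {'lastwriteprogver.branchtag': 'wikidPad', 'lastwriteprogver.major': '2', ...}
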
  • branches/mbutscher/work/lib/whoosh/analysis.py

    r230 r234  
    5454"""
    5555
     56import copy, re
    5657from array import array
    57 import copy, re
     58from collections import deque
    5859from itertools import chain
    5960
     
    8384# Token object
    8485
     86
     87# Mine:
    8588class Token(object):
     89    __slots__ = (
     90        "positions",
     91        "chars",
     92        "stopped",
     93        "boost",
     94        "removestops",
     95        "mode",
     96
     97        "startchar",
     98        "endchar",
     99        "text",
     100        "original",
     101
     102        "__dict__"
     103    )
     104
     105
    86106    """
    87107    Represents a "token" (usually a word) extracted from the source text being
     
    113133   
    114134    def __init__(self, positions=False, chars=False, removestops=True, mode='',
    115                  **kwargs):
     135                 stopped=False, boost=1.0, startchar=None, endchar=None,
     136                 text=None, original=None, **kwargs):
    116137        """
    117138        :param positions: Whether tokens should have the token position in the
     
    123144        :param mode: contains a string describing the purpose for which the
    124145            analyzer is being called, i.e. 'index' or 'query'.
    125         """
    126        
     146
     147            Do not modify the parameters after mode. They are needed for
     148            copying only
     149        """
    127150        self.positions = positions
    128151        self.chars = chars
    129         self.stopped = False
    130         self.boost = 1.0
    131152        self.removestops = removestops
    132153        self.mode = mode
    133         self.__dict__.update(kwargs)
    134    
    135     def __repr__(self):
     154
     155        self.stopped = stopped
     156        self.boost = boost
     157       
     158        self.startchar = startchar
     159        self.endchar = endchar
     160        self.text = text
     161        self.original = original
     162
     163        if kwargs:
     164            self.__dict__.update(kwargs)
     165
     166   
     167    def __repr__(self):   # TODO!!!
    136168        parms = ", ".join("%s=%r" % (name, value)
    137169                          for name, value in self.__dict__.iteritems())
     
    139171       
    140172    def copy(self):
    141         return copy.copy(self)
     173        return Token(positions=self.positions, chars=self.chars,
     174                removestops=self.removestops, mode=self.mode,
     175                stopped=self.stopped, boost=self.boost, startchar=self.startchar,
     176                endchar=self.endchar, text=self.text, original=self.original,
     177                **self.__dict__)
     178
     179
     180
     181# mchaput:
     182# class Token(object):
     183#     """
     184#     Represents a "token" (usually a word) extracted from the source text being
     185#     indexed.
     186#     
     187#     See "Advanced analysis" in the user guide for more information.
     188#     
     189#     Because object instantiation in Python is slow, tokenizers should create
     190#     ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
     191#     each time.
     192#     
     193#     This trick means that consumers of tokens (i.e. filters) must never try to
     194#     hold onto the token object between loop iterations, or convert the token
     195#     generator into a list. Instead, save the attributes between iterations,
     196#     not the object::
     197#     
     198#         def RemoveDuplicatesFilter(self, stream):
     199#             # Removes duplicate words.
     200#             lasttext = None
     201#             for token in stream:
     202#                 # Only yield the token if its text doesn't
     203#                 # match the previous token.
     204#                 if lasttext != token.text:
     205#                     yield token
     206#                 lasttext = token.text
     207#
     208#     ...or, call token.copy() to get a copy of the token object.
     209#     """
     210#     
     211#     def __init__(self, positions=False, chars=False, removestops=True, mode='',
     212#                  **kwargs):
     213#         """
     214#         :param positions: Whether tokens should have the token position in the
     215#             'pos' attribute.
     216#         :param chars: Whether tokens should have character offsets in the
     217#             'startchar' and 'endchar' attributes.
     218#         :param removestops: whether to remove stop words from the stream (if
     219#             the tokens pass through a stop filter).
     220#         :param mode: contains a string describing the purpose for which the
     221#             analyzer is being called, i.e. 'index' or 'query'.
     222#         """
     223#         
     224#         self.positions = positions
     225#         self.chars = chars
     226#         self.stopped = False
     227#         self.boost = 1.0
     228#         self.removestops = removestops
     229#         self.mode = mode
     230#         self.__dict__.update(kwargs)
     231#     
     232#     def __repr__(self):
     233#         parms = ", ".join("%s=%r" % (name, value)
     234#                           for name, value in self.__dict__.iteritems())
     235#         return "%s(%s)" % (self.__class__.__name__, parms)
     236#         
     237#     def copy(self):
     238#         # This is faster than using the copy module
     239#         return Token(**self.__dict__.copy())
     240
     241
     242
     243# mchaput modif:
     244# class Token(object):
     245#     """
     246#     Represents a "token" (usually a word) extracted from the source text being
     247#     indexed.
     248#     
     249#     See "Advanced analysis" in the user guide for more information.
     250#     
     251#     Because object instantiation in Python is slow, tokenizers should create
     252#     ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
     253#     each time.
     254#     
     255#     This trick means that consumers of tokens (i.e. filters) must never try to
     256#     hold onto the token object between loop iterations, or convert the token
     257#     generator into a list. Instead, save the attributes between iterations,
     258#     not the object::
     259#     
     260#         def RemoveDuplicatesFilter(self, stream):
     261#             # Removes duplicate words.
     262#             lasttext = None
     263#             for token in stream:
     264#                 # Only yield the token if its text doesn't
     265#                 # match the previous token.
     266#                 if lasttext != token.text:
     267#                     yield token
     268#                 lasttext = token.text
     269#
     270#     ...or, call token.copy() to get a copy of the token object.
     271#     """
     272#     
     273#     def __init__(self, positions=False, chars=False, removestops=True, mode='',
     274#                  **kwargs):
     275#         """
     276#         :param positions: Whether tokens should have the token position in the
     277#             'pos' attribute.
     278#         :param chars: Whether tokens should have character offsets in the
     279#             'startchar' and 'endchar' attributes.
     280#         :param removestops: whether to remove stop words from the stream (if
     281#             the tokens pass through a stop filter).
     282#         :param mode: contains a string describing the purpose for which the
     283#             analyzer is being called, i.e. 'index' or 'query'.
     284#         """
     285#         
     286#         self.positions = positions
     287#         self.chars = chars
     288#         self.stopped = False
     289#         self.boost = 1.0
     290#         self.removestops = removestops
     291#         self.mode = mode
     292#         self.__dict__.update(kwargs)
     293#     
     294#     def __repr__(self):
     295#         parms = ", ".join("%s=%r" % (name, value)
     296#                           for name, value in self.__dict__.iteritems())
     297#         return "%s(%s)" % (self.__class__.__name__, parms)
     298#         
     299#     def copy(self):
     300#         # This is faster than using the copy module
     301#         return Token(**self.__dict__)
     302
     303
     304
    142305
    143306
     
    184347        t = Token(positions, chars, removestops=removestops, mode=mode)
    185348        t.text = value
     349        t.boost=1.0
    186350        if keeporiginal:
    187351            t.original = value
     
    203367    """
    204368   
    205     __inittypes__ = dict(expression=unicode, gaps=bool)
    206    
    207     def __init__(self, expression=r"\w+(\.?\w+)*", gaps=False):
     369    __inittypes__ = dict(expression=unicode, gaps=bool, lowercase=bool)
     370   
     371    def __init__(self, expression=r"\w+(\.?\w+)*", gaps=False, lowercase=False):
    208372        """
    209373        :param expression: A regular expression object or string. Each match
     
    214378            than matching on the expression.
    215379        """
    216        
     380
    217381        if isinstance(expression, basestring):
    218382            self.expression = re.compile(expression, re.UNICODE)
     
    220384            self.expression = expression
    221385        self.gaps = gaps
     386       
     387        self.lowercase = lowercase
     388       
    222389   
    223390    def __eq__(self, other):
     
    245412       
    246413        assert isinstance(value, unicode), "%r is not unicode" % value
    247        
     414
     415        if self.lowercase:
     416            lowervalue = value.lower()
     417        else:
     418            lowervalue = value
     419
    248420        t = Token(positions, chars, removestops=removestops, mode=mode)
    249421        if not tokenize:
    250             t.original = t.text = value
     422            t.original = value
     423            t.text = lowervalue
     424            t.boost = 1.0
    251425            if positions: t.pos = start_pos
    252426            if chars:
     
    256430        elif not self.gaps:
    257431            # The default: expression matches are used as tokens
    258             for pos, match in enumerate(self.expression.finditer(value)):
    259                 t.text = match.group(0)
     432            for pos, match in enumerate(self.expression.finditer(lowervalue)):
     433                ms = match.start()
     434                me = match.end()
     435                t.text = lowervalue[ms:me]  # match.group(0)
    260436                if keeporiginal:
    261                     t.original = t.text
     437                    t.original = value[ms:me]
    262438                t.stopped = False
    263439                if positions:
    264440                    t.pos = start_pos + pos
    265441                if chars:
    266                     t.startchar = start_char + match.start()
    267                     t.endchar = start_char + match.end()
     442                    t.startchar = start_char + ms
     443                    t.endchar = start_char + me
    268444                yield t
    269445        else:
     
    272448            prevend = 0
    273449            pos = start_pos
    274             for match in self.expression.finditer(value):
     450            for match in self.expression.finditer(lowervalue):
    275451                start = prevend
    276452                end = match.start()
    277                 text = value[start:end]
     453                text = lowervalue[start:end]
    278454                if text:
    279455                    t.text = text
     456                    t.boost = 1.0
    280457                    if keeporiginal:
    281                         t.original = t.text
     458                        t.original = value[start:end]
    282459                    t.stopped = False
    283460                    if positions:
     
    296473            if prevend < len(value):
    297474                t.text = value[prevend:]
     475                t.boost = 1.0
    298476                if keeporiginal:
    299477                    t.original = t.text
     
    365543        if not tokenize:
    366544            t.original = t.text = value
     545            t.boost = 1.0
    367546            if positions: t.pos = start_pos
    368547            if chars:
     
    382561                    if currentchar > startchar:
    383562                        t.text = text
     563                        t.boost = 1.0
    384564                        if keeporiginal:
    385565                            t.original = t.text
     
    398578            if currentchar > startchar:
    399579                t.text = value[startchar:currentchar]
     580                t.boost = 1.0
    400581                if keeporiginal:
    401582                    t.original = t.text
     
    11691350        "the-sign", "sign-of", "of-four"
    11701351       
    1171     This can be used in fields dedicated to phrase searching. In the example
    1172     above,  the three "bi-word" tokens will be faster to find than the four
    1173     original words since there are fewer of them and they will be much less
    1174     frequent (especially compared to words like "the" and "of").
     1352    This can be used to create fields for pseudo-phrase searching, where if
     1353    all the terms match the document probably contains the phrase, but the
     1354    searching is faster than actually doing a phrase search on individual word
     1355    terms.
     1356   
     1357    The ``BiWordFilter`` is much faster than using the otherwise equivalent
     1358    ``ShingleFilter(2)``.
    11751359    """
    11761360   
     
    12131397            if positions: prev_pos = ps
    12141398       
    1215         # If at no bi-words were emitted, that is, the token stream only had
     1399        # If no bi-words were emitted, that is, the token stream only had
    12161400        # a single token, then emit that single token.
    12171401        if not atleastone:
    12181402            yield token
    12191403       
     1404
     1405class ShingleFilter(Filter):
     1406    """Merges a certain number of adjacent tokens into multi-word tokens, so
     1407    that for example::
     1408   
     1409        "better", "a", "witty", "fool", "than", "a", "foolish", "wit"
     1410       
     1411    with ``ShingleFilter(3, ' ')`` becomes::
     1412   
     1413        'better a witty', 'a witty fool', 'witty fool than', 'fool than a',
     1414        'than a foolish', 'a foolish wit'
     1415   
     1416    This can be used to create fields for pseudo-phrase searching, where if
     1417    all the terms match the document probably contains the phrase, but the
     1418    searching is faster than actually doing a phrase search on individual word
     1419    terms.
     1420   
     1421    If you're using two-word shingles, you should use the functionally
     1422    equivalent ``BiWordFilter`` instead because it's faster than
     1423    ``ShingleFilter``.
     1424    """
     1425   
     1426    def __init__(self, size=2, sep="-"):
     1427        self.size = size
     1428        self.sep = sep
     1429       
     1430    def __call__(self, tokens):
     1431        size = self.size
     1432        sep = self.sep
     1433        buf = deque()
     1434        atleastone = False
     1435       
     1436        def make_token():
     1437            tk = buf[0]
     1438            tk.text = sep.join([t.text for t in buf])
     1439            if tk.chars:
     1440                tk.endchar = buf[-1].endchar
     1441            return tk
     1442       
     1443        for token in tokens:
     1444            buf.append(token.copy())
     1445            if len(buf) == size:
     1446                atleastone = True
     1447                yield make_token()
     1448                buf.popleft()
     1449       
     1450        # If no shingles were emitted, that is, the token stream had fewer than
     1451        # 'size' tokens, then emit a single token with whatever tokens there
     1452        # were
     1453        if not atleastone:
     1454            yield make_token()
     1455
    12201456
    12211457class BoostTextFilter(Filter):
     
    12601496
    12611497
    1262 class DeliminatedAttributeFilter(Filter):
     1498class DelimitedAttributeFilter(Filter):
    12631499    """Looks for delimiter characters in the text of each token and stores the
    12641500    data after the delimiter in a named attribute on the token.
     
    13231559
    13241560class DoubleMetaphoneFilter(Filter):
    1325     def __init__(self, primary_boost=3.0):
     1561    """Transforms the text of the tokens using Lawrence Philips's Double
     1562    Metaphone algorithm. This algorithm attempts to encode words in such a way
     1563    that similar-sounding words reduce to the same code. This may be useful for
     1564    fields containing the names of people and places, and other uses where
     1565    tolerance of spelling differences is desireable.
     1566    """
     1567   
     1568    def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False):
     1569        """
     1570        :param primary_boost: the boost to apply to the token containing the
     1571            primary code.
     1572        :param secondary_boost: the boost to apply to the token containing the
     1573            secondary code, if any.
     1574        :param combine: if True, the original unencoded tokens are kept in the
     1575            stream, preceding the encoded tokens.
     1576        """
     1577       
    13261578        self.primary_boost = primary_boost
     1579        self.secondary_boost = secondary_boost
     1580        self.combine = combine
    13271581       
    13281582    def __eq__(self, other):
     
    13331587    def __call__(self, tokens):
    13341588        primary_boost = self.primary_boost
     1589        secondary_boost = self.secondary_boost
     1590        combine = self.combine
    13351591       
    13361592        for t in tokens:
     1593            if combine:
     1594                yield t
     1595           
    13371596            primary, secondary = double_metaphone(t.text)
     1597            b = t.boost
     1598            # Overwrite the token's text and boost and yield it
    13381599            if primary:
    1339                 # Save the original boost
    1340                 b = t.boost
    1341                 # Overwrite the token's text and boost and yield it
    13421600                t.text = primary
    13431601                t.boost = b * primary_boost
    13441602                yield t
    1345                 # Restored the original boost
    1346                 t.boost = b
    13471603            if secondary:
    13481604                t.text = secondary
     1605                t.boost = b * secondary_boost
    13491606                yield t
     1607               
     1608
     1609class SubstitutionFilter(Filter):
     1610    """Performas a regular expression substitution on the token text.
     1611   
     1612    This is especially useful for removing text from tokens, for example
     1613    hyphens::
     1614   
     1615        ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "")
     1616       
     1617    Because it has the full power of the re.sub() method behind it, this filter
     1618    can perform some fairly complex transformations. For example, to take tokens
     1619    like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c', 'f=e'``::
     1620   
     1621        # Analyzer that swaps the text on either side of an equal sign
     1622        ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("([^/]*)/(./*)", r"\\2/\\1")
     1623    """
     1624   
     1625    def __init__(self, pattern, replacement):
     1626        """
     1627        :param pattern: a pattern string or compiled regular expression object
     1628            describing the text to replace.
     1629        :param replacement: the substitution text.
     1630        """
     1631       
     1632        if isinstance(pattern, basestring):
     1633            pattern = re.compile(pattern, re.UNICODE)
     1634        self.pattern = pattern
     1635        self.replacement = replacement
     1636   
     1637    def __eq__(self, other):
     1638        return (other and self.__class__ is other.__class__
     1639                and self.pattern == other.pattern
     1640                and self.replacement == other.replacement)
     1641   
     1642    def __call__(self, tokens):
     1643        pattern = self.pattern
     1644        replacement = self.replacement
     1645       
     1646        for t in tokens:
     1647            t.text = pattern.sub(replacement, t.text)
     1648            yield t
     1649
    13501650
    13511651# Analyzers
     
    14831783    """
    14841784   
    1485     ret = RegexTokenizer(expression=expression, gaps=gaps)
    1486     chain = ret | LowercaseFilter()
     1785#     ret = RegexTokenizer(expression=expression, gaps=gaps)
     1786#     chain = ret | LowercaseFilter()
     1787    chain = RegexTokenizer(expression=expression, gaps=gaps, lowercase=True)
    14871788    if stoplist is not None:
    14881789        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
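
    In this patched copy of Whoosh, lowercasing is folded into RegexTokenizer itself via the new lowercase parameter (StandardAnalyzer no longer pipes through LowercaseFilter), and ShingleFilter and SubstitutionFilter are added alongside the existing filters. A short sketch of how the new pieces behave; the lowercase parameter exists only in this modified copy, not in stock Whoosh, and the sample strings are made up:

        from whoosh.analysis import RegexTokenizer, ShingleFilter, SubstitutionFilter

        # Tokenizer that lowercases as it matches
        words = [t.text for t in RegexTokenizer(lowercase=True)(u"The Sign of Four")]
        # -> [u"the", u"sign", u"of", u"four"]

        # Three-word shingles joined with spaces, for pseudo-phrase fields
        ana = RegexTokenizer(lowercase=True) | ShingleFilter(3, " ")
        shingles = [t.text for t in ana(u"better a witty fool than a foolish wit")]
        # -> [u"better a witty", u"a witty fool", ..., u"a foolish wit"]

        # Strip hyphens from token text (example taken from the docstring above)
        ana2 = RegexTokenizer(r"\S+") | SubstitutionFilter("-", "")
        print([t.text for t in ana2(u"seven-league boots")])   # [u'sevenleague', u'boots']
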
  • branches/mbutscher/work/lib/whoosh/fields.py

    r230 r234  
    6565      being updated.
    6666     
     67    * multitoken_query is a string indicating what kind of query to use when
     68      a "word" in a user query parses into multiple tokens. The string is
     69      interpreted by the query parser. The strings understood by the default
     70      query parser are "first" (use first token only), "and" (join the tokens
     71      with an AND query), "or" (join the tokens with OR), and "phrase" (join
     72      the tokens with a phrase query).
     73     
    6774    The constructor for the base field type simply lets you supply your own
    6875    configured field format, vector format, and scorable and stored values.
    6976    Subclasses may configure some or all of this for you.
    70    
    7177    """
    7278   
    7379    format = vector = scorable = stored = unique = None
    7480    indexed = True
     81    multitoken_query = "first"
    7582    __inittypes__ = dict(format=Format, vector=Format,
    7683                         scorable=bool, stored=bool, unique=bool)
    7784   
    7885    def __init__(self, format, vector=None, scorable=False, stored=False,
    79                  unique=False):
     86                 unique=False, multitoken_query="first"):
    8087        self.format = format
    8188        self.vector = vector
     
    8390        self.stored = stored
    8491        self.unique = unique
     92        self.multitoken_query = multitoken_query
    8593   
    8694    def __repr__(self):
  • branches/mbutscher/work/lib/whoosh/filedb/fileindex.py

    r231 r234  
    144144   
    145145    # Generation
    146     stream.read_int()
     146    index_gen = stream.read_int()
     147    assert gen == index_gen
    147148   
    148149    segment_counter = stream.read_int()
     
    156157
    157158def _next_segment_name(self):
    158         #Returns the name of the next segment in sequence.
    159         if self.segment_num_lock is None:
    160             self.segment_num_lock = Lock()
    161        
    162         if self.segment_num_lock.acquire():
    163             try:
    164                 self.segment_counter += 1
    165                 return
    166             finally:
    167                 self.segment_num_lock.release()
    168         else:
    169             raise LockError
     159    #Returns the name of the next segment in sequence.
     160    if self.segment_num_lock is None:
     161        self.segment_num_lock = Lock()
     162   
     163    if self.segment_num_lock.acquire():
     164        try:
     165            self.segment_counter += 1
     166            return
     167        finally:
     168            self.segment_num_lock.release()
     169    else:
     170        raise LockError
    170171
    171172
    172173def _clean_files(storage, indexname, gen, segments):
    173         # Attempts to remove unused index files (called when a new generation
    174         # is created). If existing Index and/or reader objects have the files
    175         # open, they may not be deleted immediately (i.e. on Windows) but will
    176         # probably be deleted eventually by a later call to clean_files.
    177 
    178         current_segment_names = set(s.name for s in segments)
    179 
    180         tocpattern = _toc_pattern(indexname)
    181         segpattern = _segment_pattern(indexname)
    182 
    183         todelete = set()
    184         for filename in storage:
    185             tocm = tocpattern.match(filename)
    186             segm = segpattern.match(filename)
    187             if tocm:
    188                 if int(tocm.group(1)) != gen:
    189                     todelete.add(filename)
    190             elif segm:
    191                 name = segm.group(1)
    192                 if name not in current_segment_names:
    193                     todelete.add(filename)
    194        
    195         for filename in todelete:
    196             try:
    197                 storage.delete_file(filename)
    198             except OSError:
    199                 # Another process still has this file open
    200                 pass
     174    # Attempts to remove unused index files (called when a new generation
     175    # is created). If existing Index and/or reader objects have the files
     176    # open, they may not be deleted immediately (i.e. on Windows) but will
     177    # probably be deleted eventually by a later call to clean_files.
     178
     179    current_segment_names = set(s.name for s in segments)
     180
     181    tocpattern = _toc_pattern(indexname)
     182    segpattern = _segment_pattern(indexname)
     183
     184    todelete = set()
     185    for filename in storage:
     186        tocm = tocpattern.match(filename)
     187        segm = segpattern.match(filename)
     188        if tocm:
     189            if int(tocm.group(1)) != gen:
     190                todelete.add(filename)
     191        elif segm:
     192            name = segm.group(1)
     193            if name not in current_segment_names:
     194                todelete.add(filename)
     195   
     196    for filename in todelete:
     197        try:
     198            storage.delete_file(filename)
     199        except OSError:
     200            # Another process still has this file open
     201            pass
    201202
    202203
  • branches/mbutscher/work/lib/whoosh/filedb/filepostings.py

    r231 r234  
    334334        self.stringids = stringids
    335335       
    336         assert postfile.get_int(offset) == -48626
     336        magic = postfile.get_int(offset)
     337        assert magic == -48626
     338       
    337339        self.blockcount = postfile.get_uint(offset + _INT_SIZE)
    338340        self.baseoffset = offset + _INT_SIZE * 2
  • branches/mbutscher/work/lib/whoosh/formats.py

    r231 r234  
    187187        for t in unstopped(self.analyzer(value, boosts=True, **kwargs)):
    188188            freqs[t.text] += 1
    189             weights[t.text] += int(t.boost)
     189            weights[t.text] += t.boost
    190190       
    191191        encode = self.encode
  • branches/mbutscher/work/lib/whoosh/highlight.py

    r232 r234  
    2222from __future__ import division
    2323from heapq import nlargest
    24 from cgi import escape as htmlescape
     24from collections import deque
     25
     26# from cgi import escape as htmlescape
     27from pwiki.StringOps import escapeHtml as htmlescape
    2528
    2629# Fragment object
     
    187190    """
    188191   
    189     def __init__(self, termset, maxchars=200, surround=20):
     192    def __init__(self, termset, maxchars=200, charsbefore=20, charsafter=20):
    190193        """
    191194        :param termset: A collection (probably a set or frozenset) containing
     
    193196        :param maxchars: The maximum number of characters allowed in a
    194197            fragment.
    195         :param surround: The number of extra characters of context to add both
    196             before the first matched term and after the last matched term.
    197         """
    198        
     198        :param charsbefore: The number of extra characters of context to add
     199            before the first matched term .
     200        :param charsafter: The number of extra characters of context to add
     201            after the last matched term.
     202        """
    199203        self.maxchars = maxchars
    200         self.charsbefore = self.charsafter = surround
     204        self.charsbefore = charsbefore
     205        self.charsafter = charsafter
     206
    201207   
    202208    def __call__(self, text, tokens):
     
    205211        charsafter = self.charsafter
    206212       
    207         current = []
     213        current = deque()
    208214        currentlen = 0
    209215        countdown = -1
     
    222228                if countdown < 0 or currentlen >= maxchars:
    223229                    yield Fragment(current)
    224                     current = []
     230                    current.clear()
    225231                    currentlen = 0
    226            
     232
    227233            else:
    228                 while current and currentlen > charsbefore:
    229                     t = current.pop(0)
     234#                 while current and currentlen > charsbefore: len(current) can't be 0 if currentlen > 0 and
     235#                         charsbefore should be >= 0
     236                while currentlen > charsbefore:
     237                    t = current.popleft()
    230238                    currentlen -= t.endchar - t.startchar
    231239
     
    340348
    341349
     350class SimpleHtmlFormatter(object):
     351    """Returns a string in which the matched terms are enclosed in <b></b>.
     352    """
     353   
     354    def __init__(self, between=u"... "):
     355        """
     356        :param between: the text to add between fragments.
     357        """
     358        self.between = between
     359        self.firstPos = -1
     360       
     361    def _format_fragment(self, text, fragment):
     362        output = []
     363        index = fragment.startchar
     364       
     365        for t in fragment.matches:
     366            if t.startchar > index:
     367                output.append(htmlescape(text[index:t.startchar]))
     368
     369            ttxt = htmlescape(text[t.startchar:t.endchar])
     370            if t.matched:
     371                ttxt = "<b>%s</b>" % ttxt
     372                if self.firstPos == -1:
     373                    self.firstPos = t.startchar
     374                else:
     375                    self.firstPos = min(self.firstPos, t.startchar)
     376
     377            output.append(ttxt)
     378            index = t.endchar
     379       
     380        output.append(htmlescape(text[index:fragment.endchar]))
     381        return u"".join(output)
     382
     383    def __call__(self, text, fragments):
     384        return self.between.join([self._format_fragment(text, fragment)
     385                                  for fragment in fragments]), self.firstPos
     386
     387
     388
     389
    342390class HtmlFormatter(object):
    343391    """Returns a string containing HTML formatting around the matched terms.
     
    428476
    429477
    430 class GenshiFormatter(object):
    431     """Returns a Genshi event stream containing HTML formatting around the
    432     matched terms.
    433     """
    434    
    435     def __init__(self, qname="strong", between="..."):
    436         """
    437         :param qname: the QName for the tag to wrap around matched terms.
    438         :param between: the text to add between fragments.
    439         """
    440        
    441         self.qname = qname
    442         self.between = between
    443        
    444         from genshi.core import START, END, TEXT, Attrs, Stream #@UnresolvedImport
    445         self.START, self.END, self.TEXT = START, END, TEXT
    446         self.Attrs, self.Stream = Attrs, Stream
    447 
    448     def _add_text(self, text, output):
    449         if output and output[-1][0] == self.TEXT:
    450             output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2])
    451         else:
    452             output.append((self.TEXT, text, (None, -1, -1)))
    453 
    454     def _format_fragment(self, text, fragment):
    455         START, TEXT, END, Attrs = self.START, self.TEXT, self.END, self.Attrs
    456         qname = self.qname
    457         output = []
    458        
    459         index = fragment.startchar
    460         lastmatched = False
    461         for t in fragment.matches:
    462             if t.startchar > index:
    463                 if lastmatched:
    464                     output.append((END, qname, (None, -1, -1)))
    465                     lastmatched = False
    466                 self._add_text(text[index:t.startchar], output)
    467            
    468             ttxt = text[t.startchar:t.endchar]
    469             if not lastmatched:
    470                 output.append((START, (qname, Attrs()), (None, -1, -1)))
    471                 lastmatched = True
    472             output.append((TEXT, ttxt, (None, -1, -1)))
    473                                    
    474             index = t.endchar
    475        
    476         if lastmatched:
    477             output.append((END, qname, (None, -1, -1)))
    478        
    479         return output
    480 
    481     def __call__(self, text, fragments):
    482         output = []
    483         first = True
    484         for fragment in fragments:
    485             if not first:
    486                 self._add_text(self.between, output)
    487             first = False
    488             output += self._format_fragment(text, fragment)
    489        
    490         return self.Stream(output)
     478# class GenshiFormatter(object):
     479#     """Returns a Genshi event stream containing HTML formatting around the
     480#     matched terms.
     481#     """
     482#    
     483#     def __init__(self, qname="strong", between="..."):
     484#         """
     485#         :param qname: the QName for the tag to wrap around matched terms.
     486#         :param between: the text to add between fragments.
     487#         """
     488#        
     489#         self.qname = qname
     490#         self.between = between
     491#        
     492#         from genshi.core import START, END, TEXT, Attrs, Stream #@UnresolvedImport
     493#         self.START, self.END, self.TEXT = START, END, TEXT
     494#         self.Attrs, self.Stream = Attrs, Stream
     495#
     496#     def _add_text(self, text, output):
     497#         if output and output[-1][0] == self.TEXT:
     498#             output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2])
     499#         else:
     500#             output.append((self.TEXT, text, (None, -1, -1)))
     501#
     502#     def _format_fragment(self, text, fragment):
     503#         START, TEXT, END, Attrs = self.START, self.TEXT, self.END, self.Attrs
     504#         qname = self.qname
     505#         output = []
     506#        
     507#         index = fragment.startchar
     508#         lastmatched = False
     509#         for t in fragment.matches:
     510#             if t.startchar > index:
     511#                 if lastmatched:
     512#                     output.append((END, qname, (None, -1, -1)))
     513#                     lastmatched = False
     514#                 self._add_text(text[index:t.startchar], output)
     515#            
     516#             ttxt = text[t.startchar:t.endchar]
     517#             if not lastmatched:
     518#                 output.append((START, (qname, Attrs()), (None, -1, -1)))
     519#                 lastmatched = True
     520#             output.append((TEXT, ttxt, (None, -1, -1)))
     521#                                    
     522#             index = t.endchar
     523#        
     524#         if lastmatched:
     525#             output.append((END, qname, (None, -1, -1)))
     526#        
     527#         return output
     528#
     529#     def __call__(self, text, fragments):
     530#         output = []
     531#         first = True
     532#         for fragment in fragments:
     533#             if not first:
     534#                 self._add_text(self.between, output)
     535#             first = False
     536#             output += self._format_fragment(text, fragment)
     537#        
     538#         return self.Stream(output)
    491539
    492540
  • branches/mbutscher/work/lib/whoosh/lang/porter2.py

    r230 r234  
    1 # Copyright (c) 2008 Michael Dirolf (mike at dirolf dot com)
    2  
    3 # Permission is hereby granted, free of charge, to any person
    4 # obtaining a copy of this software and associated documentation
    5 # files (the "Software"), to deal in the Software without
    6 # restriction, including without limitation the rights to use,
    7 # copy, modify, merge, publish, distribute, sublicense, and/or sell
    8 # copies of the Software, and to permit persons to whom the
    9 # Software is furnished to do so, subject to the following
    10 # conditions:
    11  
    12 # The above copyright notice and this permission notice shall be
    13 # included in all copies or substantial portions of the Software.
    14  
    15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    16 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    17 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    18 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    19 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    20 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    22 # OTHER DEALINGS IN THE SOFTWARE.
    23  
    241"""An implementation of the Porter2 stemming algorithm.
    252See http://snowball.tartarus.org/algorithms/english/stemmer.html
     
    296This algorithm is more correct but (at least in this implementation)
    307several times slower than the original porter algorithm as implemented
    31 in whoosh.lang.porter.
     8in stemming.porter.
    329"""
    3310
     
    146123 
    147124def step_1c(word):
    148     if word.endswith('y') or word.endswith('Y'):
      125    if (word.endswith('y') or word.endswith('Y')) and len(word) > 1:
    149126        if word[-2] not in 'aeiouy':
    150127            if len(word) > 2:
     
    305282    return word
    306283
    307  
     284   
     285 
  • branches/mbutscher/work/lib/whoosh/matching.py

    r231 r234  
    252252   
    253253    def __init__(self, ids, weights=None, values=None, format=None,
    254                  scorer=None, position=0):
     254                 scorer=None, position=0, all_weights=None):
    255255        """
    256256        :param ids: a list of doc IDs.
     
    267267        self._ids = ids
    268268        self._weights = weights
     269        self._all_weights = all_weights
    269270        self._values = values
    270271        self._i = position
     
    319320   
    320321    def weight(self):
    321         if self._weights:
     322        if self._all_weights:
     323            return self._all_weights
     324        elif self._weights:
    322325            return self._weights[self._i]
    323326        else:
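The new all_weights argument lets a ListMatcher report one constant weight for every posting instead of carrying a weights list as long as the id list; query.py below uses this for Every and the constant-score path. A quick sketch, assuming the vendored lib/whoosh is importable:

    from whoosh.matching import ListMatcher

    # One weight entry per posting id
    m1 = ListMatcher([1, 5, 9], weights=[2.0, 2.0, 2.0])

    # Constant weight for all postings; no per-id list is built
    m2 = ListMatcher([1, 5, 9], all_weights=2.0)

    # m1.weight() == m2.weight() == 2.0 at the current position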
     
    10231026            child.skip_to(self._id)
    10241027       
    1025         while (self._id < self.limit
    1026                and ((child.is_active() and self._id == child.id())
    1027                     or missing(self._id))):
    1028             self._id += 1
    1029             if child.is_active():
     1028        # While self._id is missing or is in the child matcher, increase it
     1029        while child.is_active() and self._id < self.limit:
     1030            if missing(self._id):
     1031                self._id += 1
     1032                continue
     1033
     1034            if self._id == child.id():
     1035                self._id += 1
    10301036                child.next()
    1031    
     1037                continue
     1038           
     1039            break
     1040       
    10321041    def id(self):
    10331042        return self._id
  • branches/mbutscher/work/lib/whoosh/qparser/default.py

    r231 r234  
    2020"""
    2121
    22 import re
    23 
    2422from whoosh import query
    25 
    26 
    27 class QueryParserError(Exception):
    28     def __init__(self, cause, msg=None):
    29         super(QueryParserError, self).__init__(str(cause))
    30         self.cause = cause
    31 
    32 
    33 def rcompile(pattern, flags=0):
    34     if not isinstance(pattern, basestring):
    35         # If it's not a string, assume it's already a compiled pattern
    36         return pattern
    37     return re.compile(pattern, re.UNICODE | flags)
    38 
    39 
    40 def get_single_text(field, text, **kwargs):
    41     # Just take the first token
    42     for t in field.process_text(text, mode="query", **kwargs):
    43         return t
     23from whoosh.qparser.syntax import *
     24from whoosh.qparser.plugins import *
    4425
    4526
     
    4829
    4930
    50 class SyntaxObject(object):
    51     """An object representing parsed text. These objects generally correspond
    52     to a query object type, and are intermediate objects used to represent
    53     the syntax tree parsed from a query string, and then generate a query
    54     tree from the syntax tree. There will be syntax objects that do not have
    55     a corresponding query type, such as the object representing whitespace.
    56     """
    57    
    58     def query(self, parser):
    59         """Returns a query object tree representing this parser object.
    60         """
    61        
    62         raise NotImplementedError
    63 
    64 
    65 # Grouping objects
    66 
    67 class Group(SyntaxObject):
    68     """An object representing a group of objects. These generally correspond
    69     to compound query objects such as ``query.And`` and ``query.Or``.
    70     """
    71    
    72     def __init__(self, tokens=None, boost=1.0):
    73         if tokens:
    74             self.tokens = tokens
    75         else:
    76             self.tokens = []
    77         self.boost = boost
    78    
    79     def __repr__(self):
    80         r = "%s(%r)" % (self.__class__.__name__, self.tokens)
    81         if self.boost != 1.0:
    82             r += "^%s" % self.boost
    83         return r
    84    
    85     def __nonzero__(self):
    86         return bool(self.tokens)
    87    
    88     def __iter__(self):
    89         return iter(self.tokens)
    90    
    91     def __len__(self):
    92         return len(self.tokens)
    93    
    94     def __getitem__(self, n):
    95         return self.tokens.__getitem__(n)
    96    
    97     def __setitem__(self, n, v):
    98         self.tokens.__setitem__(n, v)
    99    
    100     def set_boost(self, b):
    101         return self.__class__(self.tokens[:], boost=b)
    102    
    103     def set_fieldname(self, name):
    104         return self.__class__([t.set_fieldname(name) for t in self.tokens])
    105    
    106     def append(self, item):
    107         self.tokens.append(item)
    108        
    109     def extend(self, items):
    110         self.tokens.extend(items)
    111    
    112     def pop(self):
    113         return self.tokens.pop()
    114    
    115     def query(self, parser):
    116         return self.qclass([t.query(parser) for t in self.tokens],
    117                            boost=self.boost)
    118        
    119     def empty(self):
    120         return self.__class__(boost=self.boost)
    121 
    122 
    123 class AndGroup(Group):
    124     """Syntax group corresponding to an And query.
    125     """
    126    
    127     qclass = query.And
    128 
    129 
    130 class OrGroup(Group):
    131     """Syntax group corresponding to an Or query.
    132     """
    133    
    134     qclass = query.Or
    135 
    136 
    137 class AndNotGroup(Group):
    138     """Syntax group corresponding to an AndNot query.
    139     """
    140    
    141     def query(self, parser):
    142         assert len(self.tokens) == 2
    143         return query.AndNot(self.tokens[0].query(parser),
    144                             self.tokens[1].query(parser), boost=self.boost)
    145    
    146 class AndMaybeGroup(Group):
    147     """Syntax group corresponding to an AndMaybe query.
    148     """
    149    
    150     def query(self, parser):
    151         assert len(self.tokens) == 2
    152         return query.AndMaybe(self.tokens[0].query(parser),
    153                               self.tokens[1].query(parser), boost=self.boost)
    154 
    155 
    156 class DisMaxGroup(Group):
    157     """Syntax group corresponding to a DisjunctionMax query.
    158     """
    159    
    160     def __init__(self, tokens=None, tiebreak=0.0, boost=None):
    161         super(DisMaxGroup, self).__init__(tokens)
    162         self.tiebreak = tiebreak
    163    
    164     def __repr__(self):
    165         r = "dismax(%r" % self.tokens
    166         if self.tiebreak != 0:
    167             r += " tb=%s" % self.tiebreak
    168         r += ")"
    169         return r
    170    
    171     def query(self, parser):
    172         return query.DisjunctionMax([t.query(parser) for t in self.tokens],
    173                                     tiebreak=self.tiebreak)
    174        
    175     def empty(self):
    176         return self.__class__(tiebreak=self.tiebreak)
    177 
    178 
    179 class NotGroup(Group):
    180     """Syntax group corresponding to a Not query.
    181     """
    182    
    183     def __repr__(self):
    184         return "NOT(%r)" % self.tokens
    185    
    186     def query(self, parser):
    187         assert len(self.tokens) == 1
    188         return query.Not(self.tokens[0].query(parser))
    189    
    190 
    191 # Parse-able tokens
    192 
    193 class Token(SyntaxObject):
    194     """A parse-able token object. Each token class has an ``expr`` attribute
    195     containing a regular expression that matches the token text. When this
    196     expression is found, the class's ``create()`` class method is called and
    197     returns a token object to represent the match in the syntax tree. When the
    198     syntax tree is finished, the
    199     """
    200    
    201     fieldname = None
    202     endpos = None
    203    
    204     def set_boost(self, b):
    205         return self
    206    
    207     def set_fieldname(self, name):
    208         return self
    209    
    210     @classmethod
    211     def match(cls, text, pos):
    212         return cls.expr.match(text, pos)
    213    
    214     @classmethod
    215     def create(cls, parser, match):
    216         return cls()
    217    
    218     def query(self, parser):
    219         raise NotImplementedError
    220 
    221 
    222 class Singleton(Token):
    223     """Base class for tokens that don't carry any information specific to
    224     each instance (e.g. "open paranthesis" token), so they can all share the
    225     same instance.
    226     """
    227    
    228     me = None
    229    
    230     def __repr__(self):
    231         return self.__class__.__name__
    232    
    233     @classmethod
    234     def create(cls, parser, match):
    235         if not cls.me:
    236             cls.me = cls()
    237         return cls.me
    238 
    239 
    240 class White(Singleton):
    241     expr = rcompile("\\s+")
    242    
    243 
    244 class ErrorToken(Token):
    245     """A token representing an unavoidable parsing error. The ``query()``
    246     method always returns NullQuery.
    247    
    248     The default parser usually does not produce "errors" (text that doesn't
    249     match the syntax is simply treated as part of the query), so this is mostly
    250     for use by plugins that may add more restrictive parsing, for example
    251     :class:`DateParserPlugin`.
    252    
    253     Since the corresponding NullQuery will be filtered out when the query is
    254     normalized, this is really only useful for debugging and possibly for
    255     plugin filters.
    256    
    257     The ``token`` attribute may contain the token that produced the error.
    258     """
    259    
    260     def __init__(self, token):
    261         self.token = token
    262        
    263     def __repr__(self):
    264         return "<%s (%r)>" % (self.__class__.__name__, self.token)
    265    
    266     def query(self, parser):
    267         return query.NullQuery
    268 
    269 
    270 class BasicSyntax(Token):
    271     """Base class for "basic" (atomic) syntax -- term, prefix, wildcard,
    272     phrase, range.
    273     """
    274    
    275     expr = None
    276     qclass = None
    277     tokenize = False
    278     removestops = False
    279    
    280     def __init__(self, text, fieldname=None, boost=1.0):
    281         self.fieldname = fieldname
    282         self.text = text
    283         self.boost = boost
    284    
    285     def set_boost(self, b):
    286         return self.__class__(self.text, fieldname=self.fieldname, boost=b)
    287    
    288     def set_fieldname(self, name):
    289         if self.fieldname is None:
    290             return self.__class__(self.text, fieldname=name, boost=self.boost)
    291         else:
    292             return self
    293    
    294     def __repr__(self):
    295         r = "%s:%r" % (self.fieldname, self.text)
    296         if self.boost != 1.0:
    297             r += "^%s" % self.boost
    298         return r
    299    
    300     @classmethod
    301     def create(cls, parser, match):
    302         return cls(match.group(0))
    303    
    304     def query(self, parser):
    305         texts = (self.text, )
    306         fieldname = self.fieldname or parser.fieldname
    307         cls = self.qclass or parser.termclass
    308        
    309         if parser.schema and fieldname in parser.schema:
    310             field = parser.schema[fieldname]
    311            
    312             if field.self_parsing():
    313                 try:
    314                     return field.parse_query(fieldname, self.text,
    315                                              boost=self.boost)
    316                 except QueryParserError:
    317                     return query.NullQuery
    318            
    319             texts = list(field.process_text(self.text, mode="query",
    320                                             tokenize=self.tokenize,
    321                                             removestops=self.removestops))
    322        
    323         if len(texts) > 1:
    324             compound = parser.group.qclass
    325             return compound([cls(fieldname, t, boost=self.boost)
    326                              for t in texts])
    327         elif texts and texts[0] is not None:
    328             return cls(fieldname, texts[0], boost=self.boost)
    329         else:
    330             return query.NullQuery
    331 
    332 
    333 class Word(BasicSyntax):
    334     """Syntax object representing a term.
    335     """
    336    
    337     expr = rcompile("[^ \t\r\n)]+")
    338     tokenize = True
    339     removestops = True
    340    
    341 
    342 # Parser plugins
    343 
    344 class Plugin(object):
    345     """Base class for parser plugins.
    346     """
    347            
    348     def tokens(self, parser):
    349         """Returns a list of ``(token_class, priority)`` tuples to add to the
    350         syntax the parser understands.
    351         """
    352        
    353         return ()
    354    
    355     def filters(self, parser):
    356         """Returns a list of ``(filter_function, priority)`` tuples to add to
    357         parser.
    358         """
    359        
    360         return ()
    361    
    362 
    363 class RangePlugin(Plugin):
    364     """Adds the ability to specify term ranges.
    365    
    366     This plugin has no configuration.
    367    
    368     This plugin is included in the default parser configuration.
    369     """
    370    
    371     def tokens(self, parser):
    372         return ((RangePlugin.Range, 1), )
    373    
    374     class Range(Token):
    375         expr = rcompile(r"""
    376         (?P<open>\{|\[)               # Open paren
    377        
    378         (                             # Begin optional "start"
    379           (                           # Begin choice between start1 and start2
    380             ('(?P<start2>[^']+)')     # Quoted start
    381             | (?P<start1>[^ ]+)       # ...or regular start
    382           )                           # End choice
    383         [ ]+)?                        # Space at end of optional "start"
    384        
    385         [Tt][Oo]                      # "to" between start and end
    386        
    387         ([ ]+                         # Space at start of optional "end"
    388           (                           # Begin choice between end1 and end2
    389             ('(?P<end2>[^']+)')       # Quoted end
    390             | (?P<end1>[^\]\}]*)      # ...or normal end
    391           )                           # End choice
    392         )?                            # End of optional "end
    393        
    394         (?P<close>\}|\])              # Close paren
    395         """, re.VERBOSE)
    396        
    397         def __init__(self, start, end, startexcl, endexcl, fieldname=None, boost=1.0):
    398             self.fieldname = fieldname
    399             self.start = start
    400             self.end = end
    401             self.startexcl = startexcl
    402             self.endexcl = endexcl
    403             self.boost = boost
    404        
    405         def set_boost(self, b):
    406             return self.__class__(self.start, self.end, self.startexcl,
    407                                   self.endexcl, fieldname=self.fieldname,
    408                                   boost=b)
    409        
    410         def set_fieldname(self, name):
    411             return self.__class__(self.start, self.end, self.startexcl,
    412                                   self.endexcl, fieldname=name,
    413                                   boost=self.boost)
    414        
    415         def __repr__(self):
    416             r = "%s:(%r, %r, %s, %s)" % (self.fieldname, self.start, self.end,
    417                                          self.startexcl, self.endexcl)
    418             if self.boost != 1.0:
    419                 r += "^%s" % self.boost
    420             return r
    421        
    422         @classmethod
    423         def create(cls, parser, match):
    424             start = match.group("start2") or match.group("start1")
    425             end = match.group("end2") or match.group("end1")
    426             return cls(start, end, startexcl=match.group("open") == "{",
    427                        endexcl=match.group("close") == "}")
    428            
    429         def query(self, parser):
    430             fieldname = self.fieldname or parser.fieldname
    431             start, end = self.start, self.end
    432             if parser.schema and fieldname in parser.schema:
    433                 field = parser.schema[fieldname]
    434                
    435                 if field.self_parsing():
    436                     try:
    437                         rangeq = field.parse_range(fieldname, start, end,
    438                                                    self.startexcl, self.endexcl,
    439                                                    boost=self.boost)
    440                         if rangeq is not None:
    441                             return rangeq
    442                     except QueryParserError, e:
    443                         return query.NullQuery
    444                
    445                 if start:
    446                     start = get_single_text(field, start, tokenize=False,
    447                                             removestops=False)
    448                 if end:
    449                     end = get_single_text(field, end, tokenize=False,
    450                                           removestops=False)
    451            
    452             if start is None:
    453                 start = u''
    454             if end is None:
    455                 end = u'\uFFFF'
    456            
    457             return query.TermRange(fieldname, start, end, self.startexcl,
    458                                    self.endexcl, boost=self.boost)
    459            
    460 
    461 class PhrasePlugin(Plugin):
    462     """Adds the ability to specify phrase queries inside double quotes.
    463    
    464     This plugin has no configuration.
    465    
    466     This plugin is included in the default parser configuration.
    467     """
    468    
    469     def tokens(self, parser):
    470         return ((PhrasePlugin.Quotes, 0), )
    471    
    472     class Quotes(BasicSyntax):
    473         expr = rcompile('"(.*?)"')
    474        
    475         def __init__(self, text, fieldname=None, boost=1.0, slop=1):
    476             super(PhrasePlugin.Quotes, self).__init__(text, fieldname=fieldname,
    477                                                       boost=boost)
    478             self.slop = slop
    479        
    480         def __repr__(self):
    481             r = "%s:q(%r)" % (self.fieldname, self.text)
    482             if self.boost != 1.0:
    483                 r += "^%s" % self.boost
    484             return r
    485        
    486         @classmethod
    487         def create(cls, parser, match):
    488             slop = 1
    489             #if match.group(5):
    490             #    try:
    491             #        slop = int(match.group(5))
    492             #    except ValueError:
    493             #        pass
    494             return cls(match.group(1), slop=slop)
    495        
    496         def query(self, parser):
    497             fieldname = self.fieldname or parser.fieldname
    498             if parser.schema and fieldname in parser.schema:
    499                 field = parser.schema[fieldname]
    500                 #if field.self_parsing():
    501                 #    return field.parse_query(fieldname, self.text, boost=self.boost)
    502                 #else:
    503                 words = list(field.process_text(self.text, mode="query"))
    504             else:
    505                 words = self.text.split(" ")
    506            
    507             return parser.phraseclass(fieldname, words, boost=self.boost,
    508                                       slop=self.slop)
    509 
    510 
    511 class SingleQuotesPlugin(Plugin):
    512     """Adds the ability to specify single "terms" containing spaces by
    513     enclosing them in single quotes.
    514    
    515     This plugin has no configuration.
    516    
    517     This plugin is included in the default parser configuration.
    518     """
    519      
    520     def tokens(self, parser):
    521         return ((SingleQuotesPlugin.SingleQuotes, 0), )
    522    
    523     class SingleQuotes(Token):
    524         expr = rcompile(r"(^|(?<=\W))'(.*?)'(?=\s|\]|[)}]|$)")
    525        
    526         @classmethod
    527         def create(cls, parser, match):
    528             return Word(match.group(2))
    529 
    530 
    531 class PrefixPlugin(Plugin):
    532     """Adds the ability to specify prefix queries by ending a term with an
    533     asterisk. This plugin is useful if you want the user to be able to create
    534     prefix but not wildcard queries (for performance reasons). If you are
    535     including the wildcard plugin, you should not include this plugin as well.
    536     """
    537    
    538     def tokens(self, parser):
    539         return ((PrefixPlugin.Prefix, 0), )
    540    
    541     class Prefix(BasicSyntax):
    542         expr = rcompile("[^ \t\r\n*]+\\*(?= |$|\\))")
    543         qclass = query.Prefix
    544        
    545         def __repr__(self):
    546             r = "%s:pre(%r)" % (self.fieldname, self.text)
    547             if self.boost != 1.0:
    548                 r += "^%s" % self.boost
    549             return r
    550        
    551         @classmethod
    552         def create(cls, parser, match):
    553             return cls(match.group(0)[:-1])
    554        
    555 
    556 class WildcardPlugin(Plugin):
    557     """Adds the ability to specify wildcard queries by using asterisk and
    558     question mark characters in terms. Note that these types can be very
    559     performance and memory intensive. You may consider not including this
    560     type of query.
    561    
    562     This plugin is included in the default parser configuration.
    563     """
    564    
    565     def tokens(self, parser):
    566         return ((WildcardPlugin.Wild, 1), )
    567    
    568     class Wild(BasicSyntax):
    569         # \u055E = Armenian question mark
    570         # \u061F = Arabic question mark
    571         # \u1367 = Ethiopic question mark
    572         expr = rcompile(u"[^ \t\r\n*?\u055E\u061F\u1367]*[*?\u055E\u061F\u1367]\\S*")
    573         qclass = query.Wildcard
    574        
    575         def __repr__(self):
    576             r = "%s:wild(%r)" % (self.fieldname, self.text)
    577             if self.boost != 1.0:
    578                 r += "^%s" % self.boost
    579             return r
    580        
    581         @classmethod
    582         def create(cls, parser, match):
    583             return cls(match.group(0))
    584        
    585 
    586 class WhitespacePlugin(Plugin):
    587     """Parses whitespace between words in the query string. You should always
    588     include this plugin.
    589    
    590     This plugin is always automatically included by the QueryParser.
    591     """
    592    
    593     def __init__(self, tokenclass=White):
    594         self.tokenclass = tokenclass
    595    
    596     def tokens(self, parser):
    597         return ((self.tokenclass, 100), )
    598    
    599     def filters(self, parser):
    600         return ((self.do_whitespace, 500), )
    601    
    602     def do_whitespace(self, parser, stream):
    603         newstream = stream.empty()
    604         for t in stream:
    605             if isinstance(t, Group):
    606                 newstream.append(self.do_whitespace(parser, t))
    607             elif not isinstance(t, self.tokenclass):
    608                 newstream.append(t)
    609         return newstream
    610 
    611 
    612 class GroupPlugin(Plugin):
    613     """Adds the ability to group clauses using parentheses.
    614    
    615     This plugin is included in the default parser configuration.
    616     """
    617    
    618     def tokens(self, parser):
    619         return ((GroupPlugin.Open, 0), (GroupPlugin.Close, 0))
    620    
    621     def filters(self, parser):
    622         return ((GroupPlugin.do_groups, 0), )
    623    
    624     @staticmethod
    625     def do_groups(parser, stream):
    626         stack = [parser.group()]
    627         for t in stream:
    628             if isinstance(t, GroupPlugin.Open):
    629                 stack.append(parser.group())
    630             elif isinstance(t, GroupPlugin.Close):
    631                 if len(stack) > 1:
    632                     last = stack.pop()
    633                     stack[-1].append(last)
    634             else:
    635                 stack[-1].append(t)
    636        
    637         top = stack[0]
    638         if len(stack) > 1:
    639             for ls in stack[1:]:
    640                 top.extend(ls)
    641        
    642         if len(top) == 1 and isinstance(top[0], Group):
    643             top = top[0].set_boost(top.boost)
    644        
    645         return top
    646    
    647     class Open(Singleton):
    648         expr = rcompile("\\(")
    649        
    650     class Close(Singleton):
    651         expr = rcompile("\\)")
    652 
    653 
    654 class FieldsPlugin(Plugin):
    655     """Adds the ability to specify the field of a clause using a colon.
    656    
    657     This plugin is included in the default parser configuration.
    658     """
    659    
    660     def tokens(self, parser):
    661         return ((FieldsPlugin.Field, 0), )
    662    
    663     def filters(self, parser):
    664         return ((FieldsPlugin.do_fieldnames, 100), )
    665 
    666     @staticmethod
    667     def do_fieldnames(parser, stream):
    668         newstream = stream.empty()
    669         newname = None
    670         for i, t in enumerate(stream):
    671             if isinstance(t, FieldsPlugin.Field):
    672                 valid = False
    673                 if i < len(stream) - 1:
    674                     next = stream[i+1]
    675                     if not isinstance(next, (White, FieldsPlugin.Field)):
    676                         newname = t.fieldname
    677                         valid = True
    678                 if not valid:
    679                     newstream.append(Word(t.fieldname, fieldname=parser.fieldname))
    680                 continue
    681            
    682             if isinstance(t, Group):
    683                 t = FieldsPlugin.do_fieldnames(parser, t)
    684                
    685             if newname is not None:
    686                 t = t.set_fieldname(newname)
    687             newstream.append(t)
    688             newname = None
    689        
    690         return newstream
    691    
    692     class Field(Token):
    693         expr = rcompile(u"(\w[\w\d]*):")
    694        
    695         def __init__(self, fieldname):
    696             self.fieldname = fieldname
    697        
    698         def __repr__(self):
    699             return "<%s:>" % self.fieldname
    700        
    701         def set_fieldname(self, fieldname):
    702             return self.__class__(fieldname)
    703        
    704         @classmethod
    705         def create(cls, parser, match):
    706             fieldname = match.group(1)
    707             if not parser.schema or (fieldname in parser.schema):
    708                 return cls(fieldname)
    709    
    710 
    711 class CompoundsPlugin(Plugin):
    712     """Adds the ability to use AND, OR, ANDMAYBE, and ANDNOT to specify
    713     query constraints.
    714    
    715     You can customize the tokens by passing regular expressions to the ``And``,
    716     ``Or``, ``AndNot``, and/or ``AndMaybe`` keywords to the class initializer::
    717    
    718         qp = qparser.QueryParser("content")
    719        
    720         cp = qparser.CompoundsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~")
    721         qp.replace_plugin(cp)
    722    
    723     This plugin is included in the default parser configuration.
    724     """
    725    
    726     def __init__(self, And=r"\sAND\s", Or=r"\sOR\s", AndNot=r"\sANDNOT\s",
    727                  AndMaybe=r"\sANDMAYBE\s"):
    728         # Create one-off token classes using the keyword arguments
    729         class AndTokenClass(Singleton):
    730             expr = rcompile(And)
    731         class OrTokenClass(Singleton):
    732             expr = rcompile(Or)
    733         class AndNotTokenClass(Singleton):
    734             expr = rcompile(AndNot)
    735         class AndMaybeTokenClass(Singleton):
    736             expr = rcompile(AndMaybe)
    737            
    738         # Store these classes as attributes
    739         self.And = AndTokenClass
    740         self.Or = OrTokenClass
    741         self.AndNot = AndNotTokenClass
    742         self.AndMaybe = AndMaybeTokenClass
    743    
    744     def tokens(self, parser):
    745         return ((self.AndNot, -10), (self.AndMaybe, -5), (self.And, 0),
    746                 (self.Or, 0))
    747    
    748     def filters(self, parser):
    749         return ((self.do_compounds, 600), )
    750 
    751     def do_compounds(self, parser, stream):
    752         newstream = stream.empty()
    753         i = 0
    754         while i < len(stream):
    755             # The current token
    756             t = stream[i]
    757            
    758             # Whether this token has other tokens in front and behind; that is,
    759             # if ismiddle is True, this is not the first or last token
    760             ismiddle = newstream and i < len(stream) - 1
    761            
    762             if isinstance(t, Group):
    763                 # The current token is a group: recursively apply this plugin
    764                 # to the group
    765                 newstream.append(self.do_compounds(parser, t))
    766                
    767             elif isinstance(t, (self.And, self.Or)):
    768                 # This is either an And or Or token. Create a new Group class
    769                 # of the appropriate type
    770                 if isinstance(t, self.And):
    771                     cls = AndGroup
    772                 else:
    773                     cls = OrGroup
    774                
    775                 if cls != type(newstream) and ismiddle:
    776                     last = newstream.pop()
    777                     rest = self.do_compounds(parser, cls(stream[i+1:]))
    778                     newstream.append(cls([last, rest]))
    779                     break
    780            
    781             elif isinstance(t, (self.AndNot, self.AndMaybe)) and ismiddle:
    782                 # This is either an AndNot or AndMaybe token. Create a new
    783                 # Group class of the appropriate type
    784                 if isinstance(t, self.AndNot):
    785                     cls = AndNotGroup
    786                 else:
    787                     cls = AndMaybeGroup
    788                
    789                 last = newstream.pop()
    790                 i += 1
    791                 next = stream[i]
    792                 if isinstance(next, Group):
    793                     next = self.do_compounds(parser, next)
    794                 newstream.append(cls([last, next]))
    795            
    796             else:
    797                 newstream.append(t)
    798            
    799             i += 1
    800        
    801         return newstream
    802 
    803 
    804 class BoostPlugin(Plugin):
    805     """Adds the ability to boost clauses of the query using the circumflex.
    806    
    807     This plugin is included in the default parser configuration.
    808     """
    809    
    810     def tokens(self, parser):
    811         return ((BoostPlugin.Boost, 0), )
    812    
    813     def filters(self, parser):
    814         return ((BoostPlugin.clean_boost, 0), (BoostPlugin.do_boost, 700))
    815 
    816     @staticmethod
    817     def clean_boost(parser, stream):
    818         newstream = stream.empty()
    819         for i, t in enumerate(stream):
    820             if isinstance(t, BoostPlugin.Boost):
    821                 if i == 0 or isinstance(stream[i-1], (BoostPlugin.Boost, White)):
    822                     t = Word(t.original)
    823             newstream.append(t)
    824         return newstream
    825 
    826     @staticmethod
    827     def do_boost(parser, stream):
    828         newstream = stream.empty()
    829        
    830         for t in stream:
    831             if isinstance(t, Group):
    832                 newstream.append(BoostPlugin.do_boost(parser, t))
    833                
    834             elif isinstance(t, BoostPlugin.Boost):
    835                 if newstream:
    836                     newstream.append(newstream.pop().set_boost(t.boost))
    837                
    838             else:
    839                 newstream.append(t)
    840        
    841         return newstream
    842    
    843     class Boost(Token):
    844         expr = rcompile("\\^([0-9]+(.[0-9]+)?)($|(?=[ \t\r\n]))")
    845        
    846         def __init__(self, original, boost):
    847             self.original = original
    848             self.boost = boost
    849        
    850         def __repr__(self):
    851             return "<^%s>" % self.boost
    852        
    853         @classmethod
    854         def create(cls, parser, match):
    855             try:
    856                 return cls(match.group(0), float(match.group(1)))
    857             except ValueError:
    858                 return Word(match.group(0))
    859    
    860 
    861 class NotPlugin(Plugin):
    862     """Adds the ability to negate a clause by preceding it with NOT.
    863    
    864     You can customize the token by passing a regular expression to the class
    865     initializer::
    866    
    867         qp = qparser.QueryParser("content")
    868        
    869         # Use - as the not token
    870         qp.replace_plugin(qparser.NotPlugin("(^|(?<= ))-"))
    871        
    872         # Use ! as the not token
    873         qp.replace_plugin(qparser.NotPlugin("(^|(?<= ))!"))
    874    
    875     This plugin is included in the default parser configuration.
    876     """
    877    
    878     def __init__(self, token="(^|(?<= ))NOT "):
    879         class Not(Singleton):
    880             expr = rcompile(token)
    881        
    882         self.Not = Not
    883    
    884     def tokens(self, parser):
    885         return ((self.Not, 0), )
    886    
    887     def filters(self, parser):
    888         return ((self.do_not, 800), )
    889    
    890     def do_not(self, parser, stream):
    891         newstream = stream.empty()
    892         notnext = False
    893         for t in stream:
    894             if isinstance(t, self.Not):
    895                 notnext = True
    896                 continue
    897            
    898             if isinstance(t, Group):
    899                 t = self.do_not(parser, t)
    900            
    901             if notnext:
    902                 t = NotGroup([t])
    903            
    904             newstream.append(t)
    905             notnext = False
    906            
    907         return newstream
    908    
    909 
    910 class PlusMinusPlugin(Plugin):
    911     """Adds the ability to use + and - in a flat OR query to specify required
    912     and prohibited terms.
    913    
    914     This is the basis for the parser configuration returned by
    915     ``SimpleParser()``.
    916     """
    917    
    918     def tokens(self, parser):
    919         return ((PlusMinusPlugin.Plus, 0), (PlusMinusPlugin.Minus, 0))
    920    
    921     def filters(self, parser):
    922         return ((PlusMinusPlugin.do_plusminus, 510), )
    923    
    924     @staticmethod
    925     def do_plusminus(parser, stream):
    926         required = AndGroup()
    927         optional = OrGroup()
    928         prohibited = OrGroup()
    929        
    930         nextlist = optional
    931         for t in stream:
    932             if isinstance(t, PlusMinusPlugin.Plus):
    933                 nextlist = required
    934             elif isinstance(t, PlusMinusPlugin.Minus):
    935                 nextlist = prohibited
    936             else:
    937                 nextlist.append(t)
    938                 nextlist = optional
    939        
    940         r = optional
    941         if required:
    942             r = AndMaybeGroup([required, optional])
    943         if prohibited:
    944             r = AndNotGroup([r, prohibited])
    945         return r
    946    
    947     class Plus(Singleton):
    948         expr = rcompile("\\+")
    949        
    950     class Minus(Singleton):
    951         expr = rcompile("-")
    952 
    953 
    954 class MultifieldPlugin(Plugin):
    955     """Converts any unfielded terms into OR clauses that search for the
    956     term in a specified list of fields.
    957     """
    958    
    959     def __init__(self, fieldnames, fieldboosts=None):
    960         """
    961         :param fieldnames: a list of fields to search.
    962         :param fieldboosts: an optional dictionary mapping field names to
    963             a boost to use for that field.
    964         """
    965        
    966         self.fieldnames = fieldnames
    967         self.boosts = fieldboosts or {}
    968    
    969     def filters(self, parser):
    970         return ((self.do_multifield, 110), )
    971    
    972     def do_multifield(self, parser, stream):
    973         newstream = stream.empty()
    974         for t in stream:
    975             if isinstance(t, BasicSyntax) and t.fieldname is None:
    976                 t = OrGroup([t.set_fieldname(fn).set_boost(self.boosts.get(fn, 1.0))
    977                              for fn in self.fieldnames])
    978             newstream.append(t)
    979         return newstream
    980        
    981 
    982 class DisMaxPlugin(Plugin):
    983     """Converts any unfielded terms into DisjunctionMax clauses that search
    984     for the term in a specified list of fields.
    985     """
    986    
    987     def __init__(self, fieldboosts, tiebreak=0.0):
    988         """
    989         :param fieldboosts: a dictionary mapping field names to a boost to use
    990             for that in the DisjuctionMax query.
    991         """
    992        
    993         self.fieldboosts = fieldboosts.items()
    994         self.tiebreak = tiebreak
    995    
    996     def filters(self, parser):
    997         return ((self.do_dismax, 110), )
    998    
    999     def do_dismax(self, parser, stream):
    1000         newstream = stream.empty()
    1001         for t in stream:
    1002             if isinstance(t, BasicSyntax) and t.fieldname is None:
    1003                 t = DisMaxGroup([t.set_fieldname(fn).set_boost(b)
    1004                                  for fn, b in self.fieldboosts],
    1005                                  tiebreak=self.tiebreak)
    1006             newstream.append(t)
    1007         return newstream
    1008 
    1009 
    1010 class FieldAliasPlugin(Plugin):
    1011     """Adds the ability to use "aliases" of fields in the query string.
    1012    
    1013     >>> # Allow users to use 'body' or 'text' to refer to the 'content' field
    1014     >>> parser.add_plugin(FieldAliasPlugin({"content": ("body", "text")}))
    1015     >>> parser.parse("text:hello")
    1016     Term("content", "hello")
    1017     """
    1018    
    1019     def __init__(self, fieldmap):
    1020         """
    1021         :param fieldmap: a dictionary mapping fieldnames to a list of
    1022             aliases for the field.
    1023         """
    1024        
    1025         self.fieldmap = fieldmap
    1026         self.reverse = {}
    1027         for key, values in fieldmap.iteritems():
    1028             for value in values:
    1029                 self.reverse[value] = key
    1030        
    1031     def filters(self, parser):
    1032         return ((self.do_aliases, 90), )
    1033    
    1034     def do_aliases(self, parser, stream):
    1035         newstream = stream.empty()
    1036         for t in stream:
    1037             if (not isinstance(t, Group)
    1038                   and t.fieldname is not None
    1039                   and t.fieldname in self.reverse):
    1040                     t = t.set_fieldname(self.reverse[t.fieldname])
    1041             newstream.append(t)
    1042         return newstream
    1043 
    1044 
    1045 # Parser object
    1046 
    1047 full_profile = (BoostPlugin, CompoundsPlugin, FieldsPlugin, GroupPlugin,
    1048                 NotPlugin, PhrasePlugin, RangePlugin, SingleQuotesPlugin,
    1049                 WildcardPlugin)
     31full_profile = (BoostPlugin, OperatorsPlugin, FieldsPlugin, GroupPlugin,
     32                PhrasePlugin, RangePlugin, SingleQuotesPlugin, WildcardPlugin)
    105033
    105134
     
    106548    And([Term("content", u"hello"), Term("content", u"there")])
    106649    """
     50   
     51    _multitoken_query_map = {"and": query.And, "or": query.Or,
     52                             "phrase": query.Phrase}
    106753   
    106854    def __init__(self, fieldname, schema=None, termclass=query.Term,
     
    1152138        return [item for item, pri in items_and_priorities]
    1153139   
     140    def multitoken_query(self, name, texts, fieldname, termclass, boost):
     141        qclass = self._multitoken_query_map.get(name.lower())
     142        if qclass:
     143            return qclass([termclass(fieldname, t, boost=boost)
     144                           for t in texts])
     145   
     146    def term_query(self, fieldname, text, termclass, boost=1.0, tokenize=True,
     147                   removestops=True):
     148        """Returns the appropriate query object for a single term in the query
     149        string.
     150        """
     151       
     152        if self.schema and fieldname in self.schema:
     153            field = self.schema[fieldname]
     154           
     155            # If this field type wants to parse queries itself, let it do so
     156            # and return early
     157            if field.self_parsing():
     158                try:
     159                    return field.parse_query(fieldname, text, boost=boost)
     160                except QueryParserError:
     161                    return query.NullQuery
     162           
     163            # Otherwise, ask the field to process the text into a list of
     164            # tokenized strings
     165            texts = list(field.process_text(text, mode="query",
     166                                            tokenize=tokenize,
     167                                            removestops=removestops))
     168           
     169            # If the analyzer returned more than one token, use the field's
     170            # multitoken_query attribute to decide what query class, if any, to
     171            # use to put the tokens together
     172            if len(texts) > 1:
     173                mtq = self.multitoken_query(field.multitoken_query, texts,
     174                                            fieldname, termclass, boost)
     175                if mtq:
     176                    return mtq
     177               
     178            # It's possible field.process_text() will return an empty list (for
     179            # example, on a stop word)
     180            if not texts:
     181                return query.NullQuery
     182           
     183            text = texts[0]
     184       
     185        return termclass(fieldname, text, boost=boost)
     186       
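The new multitoken_query() and term_query() methods centralize term building: when a field's analyzer turns one query-string token into several terms, the field's multitoken_query setting ("and", "or" or "phrase") picks the query class used to recombine them. A hedged sketch of the mapping itself (made-up field name and terms, vendored whoosh assumed importable):

    from whoosh import query
    from whoosh.qparser import QueryParser

    qp = QueryParser("content", schema=None)
    q = qp.multitoken_query("or", [u"hello", u"there"], "content",
                            query.Term, 1.0)
    # -> Or([Term("content", u"hello"), Term("content", u"there")])
    # An unrecognized name returns None; term_query() then falls back to
    # using only the first token.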
    1154187    def tokens(self):
    1155188        """Returns a priorized list of tokens from the included plugins.
  • branches/mbutscher/work/lib/whoosh/query.py

    r230 r234  
    752752
    753753
    754 _wildcard_exp = re.compile("(.*?)([?*]|$)");
    755754class Wildcard(MultiTerm):
    756755    """Matches documents that contain any terms that match a wildcard
     
    862861        """
    863862
    864         if not text:
    865             raise QueryError("Fuzzy term is empty")
    866 
    867863        self.fieldname = fieldname
    868864        self.text = text
     
    881877
    882878    def __repr__(self):
    883         return "%s(%r, %r, ratio=%f)" % (self.__class__.__name__,
    884                                         self.fieldname, self.text,
    885                                          self.ratio)
     879        r = "%s(%r, %r, boost=%f, minsimilarity=%f, prefixlength=%d)"
     880        return r % (self.__class__.__name__, self.fieldname, self.text,
     881                    self.boost, self.minsimilarity, self.prefixlength)
    886882
    887883    def __unicode__(self):
    888         return u"~" + self.text
     884        r = u"~" + self.text
     885        if self.boost != 1.0:
     886            r += "^%f" % self.boost
     887        return r
    889888
    890889    def copy(self):
     
    982981
    983982    def replace(self, oldtext, newtext):
    984         if self.start == oldtext:
    985             return TermRange(self.fieldname, newtext, self.end,
    986                              self.startexcl, self.endexcl, boost=self.boost)
    987         elif self.end == oldtext:
    988             return TermRange(self.fieldname, self.start, newtext,
    989                              self.startexcl, self.endexcl, boost=self.boost)
    990         else:
    991             return self
    992 
     983        start = newtext if self.start == oldtext else self.start
     984        end = newtext if self.end == oldtext else self.end
     985        return self.__class__(self.fieldname, start, end, self.startexcl,
     986                              self.endexcl, boost=self.boost)
     987       
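The rewritten replace() substitutes oldtext in both the start and the end bound in one pass; the old version returned early after matching the start bound, so an end bound with the same text was never rewritten. A small illustration with made-up bounds:

    from whoosh import query

    tr = query.TermRange("tag", u"alpha", u"alpha")
    tr2 = tr.replace(u"alpha", u"beta")
    # tr2.start == u"beta" and tr2.end == u"beta";
    # the excl flags and boost are carried over unchanged.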
    993988    def _words(self, ixreader):
    994989        fieldname = self.fieldname
     
    12061201
    12071202
    1208 class Phrase(MultiTerm):
     1203class Phrase(Query):
    12091204    """Matches documents containing a given phrase."""
    12101205
     
    13091304
    13101305
     1306class Ordered(And):
     1307    """Matches documents containing a list of sub-queries in the given order.
     1308    """
     1309
     1310    JOINT = " BEFORE "
     1311   
     1312    def matcher(self, searcher, exclude_docs=None):
     1313        from spans import SpanBefore
     1314        return self._matcher(SpanBefore.SpanBeforeMatcher, searcher,
     1315                             exclude_docs=exclude_docs)
     1316
     1317
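Ordered above is a thin And subclass whose matcher goes through the span machinery, so its sub-queries have to occur in the given order inside the document. It is built exactly like And; a sketch with made-up terms:

    from whoosh import query

    # Documents where "configure" appears before "install" in "content";
    # the JOINT string " BEFORE " is used when the query is printed.
    q = query.Ordered([query.Term("content", u"configure"),
                       query.Term("content", u"install")])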
    13111318class Every(Query):
    13121319    """A query that matches every document containing any word in a given
     
    13401347    def matcher(self, searcher, exclude_docs=None):
    13411348        fieldname = self.fieldname
    1342         s = set()
    13431349       
    13441350        # This is a hacky hack, but just create an in-memory set of all the
    13451351        # document numbers of every term in the field
    1346         for text in searcher.lexicon(fieldname):
    1347             pr = searcher.postings(fieldname, text)
    1348             s.update(pr.all_ids())
     1352        s = set()
     1353       
     1354        if fieldname == "*":
     1355            s.update(xrange(searcher.doc_count_all()))
     1356        else:
     1357            for text in searcher.lexicon(fieldname):
     1358                pr = searcher.postings(fieldname, text)
     1359                s.update(pr.all_ids())
     1360       
    13491361        if exclude_docs:
    13501362            s.difference_update(exclude_docs)
    13511363       
    1352         return ListMatcher(sorted(s), weights=[self.boost] * len(s))
     1364        return ListMatcher(sorted(s), all_weights=self.boost)
    13531365
    13541366           
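Every.matcher() changes in two ways above: the field name "*" now means every document in the index (the ids 0 .. doc_count_all()-1 are added directly), and the result matcher gets the new all_weights shortcut instead of a boost list as long as the id set. Hedged usage sketch:

    from whoosh import query

    # Every document that has at least one term in "content"
    q_field = query.Every("content")

    # Every document in the index, regardless of field (new with this change)
    q_all = query.Every("*")
    # hits = searcher.search(q_all)    # assuming an open Searcher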
     
    13941406        else:
    13951407            ids = array("I", m.all_ids())
    1396             return ListMatcher(ids, weights=[self.score] * len(ids))
     1408            return ListMatcher(ids, all_weights=self.score)
    13971409       
    13981410    def replace(self, oldtext, newtext):
     
    14511463
    14521464
    1453 class Require(CompoundQuery):
     1465class BinaryQuery(CompoundQuery):
     1466    """Base class for binary queries (queries which are composed of two
     1467    sub-queries). Subclasses should set the ``matcherclass`` attribute or
     1468    override ``matcher()``, and may also need to override ``normalize()``,
     1469    ``estimate_size()``, and/or ``estimate_min_size()``.
     1470    """
     1471   
     1472    def __init__(self, a, b, boost=1.0):
     1473        self.a = a
     1474        self.b = b
     1475        self.subqueries = (a, b)
     1476        self.boost = boost
     1477       
     1478    def copy(self):
     1479        return self.__class__(self.a, self.b, boost=self.boost)
     1480   
     1481    def normalize(self):
     1482        a = self.a.normalize()
     1483        b = self.b.normalize()
     1484        if a is NullQuery and b is NullQuery:
     1485            return NullQuery
     1486        elif a is NullQuery:
     1487            return b
     1488        elif b is NullQuery:
     1489            return a
     1490   
     1491        return self.__class__(a, b, boost=self.boost)
     1492   
     1493    def matcher(self, searcher, exclude_docs=None):
     1494        return self.matcherclass(self.a.matcher(searcher, exclude_docs=exclude_docs),
     1495                                 self.b.matcher(searcher, exclude_docs=exclude_docs))
     1496
     1497
     1498class Require(BinaryQuery):
    14541499    """Binary query returns results from the first query that also appear in
    14551500    the second query, but only uses the scores from the first query. This lets
     
    14581503
    14591504    JOINT = " REQUIRE "
    1460 
    1461     def __init__(self, scoredquery, requiredquery, boost=1.0):
    1462         """
    1463         :param scoredquery: The query that is scored. Only documents that also
    1464             appear in the second query ('requiredquery') are scored.
    1465         :param requiredquery: Only documents that match both 'scoredquery' and
    1466             'requiredquery' are returned, but this query does not
    1467             contribute to the scoring.
    1468         """
    1469 
    1470         # The superclass CompoundQuery expects the subqueries to be in a
    1471         # sequence in self.subqueries
    1472         self.subqueries = (scoredquery, requiredquery)
    1473         self.boost = boost
    1474 
    1475     def copy(self):
    1476         return self.__class__(self.subqueries[0], self.subqueries[1],
     1505    matcherclass = RequireMatcher
     1506
     1507    def estimate_size(self, ixreader):
     1508        return self.b.estimate_size(ixreader)
     1509   
     1510    def estimate_min_size(self, ixreader):
     1511        return self.b.estimate_min_size(ixreader)
     1512
     1513    def normalize(self):
     1514        a = self.a.normalize()
     1515        b = self.b.normalize()
     1516        if a is NullQuery or b is NullQuery:
     1517            return NullQuery
     1518        return self.__class__(a, b, boost=self.boost)
     1519   
     1520    def replace(self, oldtext, newtext):
     1521        return self.__class__(self.a.replace(oldtext, newtext),
     1522                              self.b.replace(oldtext, newtext),
    14771523                              boost=self.boost)
    1478 
    1479     def estimate_size(self, ixreader):
    1480         return self.subqueries[1].estimate_size(ixreader)
    1481    
    1482     def estimate_min_size(self, ixreader):
    1483         return self.subqueries[1].estimate_min_size(ixreader)
    1484 
    1485     def normalize(self):
    1486         subqueries = [q.normalize() for q in self.subqueries]
    1487         if NullQuery in subqueries:
    1488             return NullQuery
    1489         return Require(subqueries[0], subqueries[1], boost=self.boost)
    1490 
     1524   
    14911525    def docs(self, searcher, exclude_docs=None):
    14921526        return And(self.subqueries).docs(searcher, exclude_docs=exclude_docs)
    14931527   
    1494     def matcher(self, searcher, exclude_docs=None):
    1495         scored, required = self.subqueries
    1496         return RequireMatcher(scored.matcher(searcher, exclude_docs=exclude_docs),
    1497                               required.matcher(searcher, exclude_docs=exclude_docs))
    1498 
    1499 
    1500 class AndMaybe(CompoundQuery):
     1528
     1529class AndMaybe(BinaryQuery):
    15011530    """Binary query takes results from the first query. If and only if the
    15021531    same document also appears in the results from the second query, the score
     
    15051534
    15061535    JOINT = " ANDMAYBE "
    1507 
    1508     def __init__(self, requiredquery, optionalquery, boost=1.0):
    1509         """
    1510         :param requiredquery: Documents matching this query are returned.
    1511         :param optionalquery: If a document matches this query as well as
    1512             'requiredquery', the score from this query is added to the
    1513             document score from 'requiredquery'.
    1514         """
    1515 
    1516         # The superclass CompoundQuery expects the subqueries to be
    1517         # in a sequence in self.subqueries
    1518         self.subqueries = (requiredquery, optionalquery)
    1519         self.boost = boost
    1520 
    1521     def copy(self):
    1522         return self.__class__(self.subqueries[0], self.subqueries[1],
    1523                               boost=self.boost)
     1536    matcherclass = AndMaybeMatcher
    15241537
    15251538    def normalize(self):
    1526         required, optional = (q.normalize() for q in self.subqueries)
    1527         if required is NullQuery:
     1539        a = self.a.normalize()
     1540        b = self.b.normalize()
     1541        if a is NullQuery:
    15281542            return NullQuery
    1529         if optional is NullQuery:
    1530             return required
    1531         return AndMaybe(required, optional, boost=self.boost)
     1543        if b is NullQuery:
     1544            return a
     1545        return self.__class__(a, b, boost=self.boost)
    15321546
    15331547    def estimate_min_size(self, ixreader):
     
    15361550    def docs(self, searcher, exclude_docs=None):
    15371551        return self.subqueries[0].docs(searcher, exclude_docs=exclude_docs)
    1538    
    1539     def matcher(self, searcher, exclude_docs=None):
    1540         required, optional = self.subqueries
    1541         return AndMaybeMatcher(required.matcher(searcher, exclude_docs=exclude_docs),
    1542                                 optional.matcher(searcher, exclude_docs=exclude_docs))
    1543 
    1544 
    1545 class AndNot(Query):
     1552
     1553
     1554class AndNot(BinaryQuery):
    15461555    """Binary boolean query of the form 'a ANDNOT b', where documents that
    15471556    match b are removed from the matches for a.
    15481557    """
    15491558
    1550     def __init__(self, positive, negative, boost=1.0):
    1551         """
    1552         :param positive: query to INCLUDE.
    1553         :param negative: query whose matches should be EXCLUDED.
    1554         :param boost: boost factor that should be applied to the raw score of
    1555             results matched by this query.
    1556         """
    1557 
    1558         self.positive = positive
    1559         self.negative = negative
    1560         self.boost = boost
    1561 
    1562     def __eq__(self, other):
    1563         return (other
    1564                 and self.__class__ is other.__class__
    1565                 and self.positive == other.positive
    1566                 and self.negative == other.negative
    1567                 and self.boost == other.boost)
    1568 
    1569     def __repr__(self):
    1570         return "%s(%r, %r)" % (self.__class__.__name__,
    1571                                self.positive, self.negative)
    1572 
    1573     def __unicode__(self):
    1574         return u"%s ANDNOT %s" % (self.positive, self.negative)
    1575 
    1576     def copy(self):
    1577         return self.__class__(self.positive, self.negative, boost=self.boost)
     1559    JOINT = " ANDNOT "
    15781560
    15791561    def normalize(self):
    1580         pos = self.positive.normalize()
    1581         neg = self.negative.normalize()
    1582 
    1583         if pos is NullQuery:
     1562        a = self.a.normalize()
     1563        b = self.b.normalize()
     1564
     1565        if a is NullQuery:
    15841566            return NullQuery
    1585         elif neg is NullQuery:
    1586             return pos
    1587 
    1588         return AndNot(pos, neg, boost=self.boost)
    1589 
    1590     def replace(self, oldtext, newtext):
    1591         return AndNot(self.positive.replace(oldtext, newtext),
    1592                       self.negative.replace(oldtext, newtext),
    1593                       boost=self.boost)
     1567        elif b is NullQuery:
     1568            return a
     1569
     1570        return self.__class__(a, b, boost=self.boost)
    15941571
    15951572    def _all_terms(self, termset, phrases=True):
    1596         self.positive.all_terms(termset, phrases=phrases)
     1573        self.a.all_terms(termset, phrases=phrases)
    15971574
    15981575    def _existing_terms(self, ixreader, termset, reverse=False, phrases=True):
    1599         self.positive.existing_terms(ixreader, termset, reverse=reverse,
    1600                                      phrases=phrases)
    1601 
    1602     def matcher(self, searcher, exclude_docs=None):
    1603         notvector = _not_vector(searcher, [self.negative], exclude_docs)
    1604         return self.positive.matcher(searcher, exclude_docs=notvector)
     1576        self.a.existing_terms(ixreader, termset, reverse=reverse,
     1577                              phrases=phrases)
     1578
     1579    def matcher(self, searcher, exclude_docs=None):
     1580        # This is faster than actually using an AndNotMatcher, but could use
     1581        # a lot of memory on a very large index.
     1582        # TODO: Switch based on size of index?
     1583        notvector = _not_vector(searcher, [self.b], exclude_docs)
     1584        return self.a.matcher(searcher, exclude_docs=notvector)
     1585
     1586
     1587class Otherwise(BinaryQuery):
     1588    """A binary query that only matches the second clause if the first clause
     1589    doesn't match any documents.
     1590    """
     1591   
     1592    JOINT = " OTHERWISE "
     1593   
     1594    def matcher(self, searcher, exclude_docs=None):
     1595        m = self.a.matcher(searcher, exclude_docs=exclude_docs)
     1596        if not m.is_active():
     1597            m = self.b.matcher(searcher, exclude_docs=exclude_docs)
     1598        return m
    16051599
    16061600
     
    16091603
    16101604
    1611 
    1612 
    1613 
    1614 
    1615 
    1616 
    1617 
    1618 
    1619 
    1620 
     1605           
     1606
     1607
     1608
     1609
     1610
     1611
     1612
     1613
     1614
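The hunks above replace the old Require/AndMaybe/AndNot implementations with subclasses of the new BinaryQuery base and add the Ordered and Otherwise queries. A minimal sketch of composing these two-clause queries, assuming the branch's bundled whoosh.query module; the field names and terms are purely illustrative:

    from whoosh.query import Term, Require, AndNot, Otherwise

    scored   = Term("content", u"render")     # contributes to scoring
    required = Term("status", u"published")   # filters but does not score

    q = Otherwise(Require(scored, required),
                  AndNot(scored, Term("content", u"draft")))
    print q.normalize()   # BinaryQuery.normalize() folds away NullQuery children
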
  • branches/mbutscher/work/lib/whoosh/reading.py

    r230 r234  
    591591        postreaders = []
    592592        docoffsets = []
     593        excl = exclude_docs
     594        term = (fieldname, text)
     595       
    593596        for i, r in enumerate(self.readers):
    594             try:
     597            if term in r:
     598                offset = self.doc_offsets[i]
     599               
     600                # If an exclude_docs set was passed in, we need to pull out
     601                # the document numbers that apply to this reader and subtract
     602                # the offset from them
     603                if exclude_docs and i > 0:
     604                    limit = offset + r.doc_count_all()
     605                    # Create a subset of the exclude_docs set with the offset
     606                    # subtracted
     607                    excl = set(docnum - offset for docnum in exclude_docs
     608                               if docnum >= offset and docnum < limit)
     609               
     610                # Get a posting reader for the term and add it to the list
    595611                pr = r.postings(fieldname, text, scorer=scorer,
    596                                 exclude_docs=exclude_docs)
     612                                exclude_docs=excl)
    597613                postreaders.append(pr)
    598                 docoffsets.append(self.doc_offsets[i])
    599             except TermNotFound:
    600                 pass
     614                docoffsets.append(offset)
    601615       
    602616        if not postreaders:
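The reading.py hunk translates the global document numbers in exclude_docs into each sub-reader's local numbering before asking it for postings. A small stand-alone sketch of that offset arithmetic (the numbers are made up):

    exclude_docs = set([3, 12, 57])   # global doc numbers across all segments
    offset, doc_count = 10, 40        # hypothetical sub-reader covering docs 10..49

    limit = offset + doc_count
    excl = set(docnum - offset for docnum in exclude_docs
               if docnum >= offset and docnum < limit)
    # excl == set([2]); docs 3 and 57 belong to other sub-readers
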
  • branches/mbutscher/work/lib/whoosh/scoring.py

    r231 r234  
    479479                cache[docid] = i
    480480
    481         self.limit = i
    482481        self._fieldcache = cache
    483482        return cache
  • branches/mbutscher/work/lib/whoosh/searching.py

    r230 r234  
    872872    """Represents a single search result ("hit") in a Results object.
    873873   
     874    This object acts like a dictionary of the matching document's stored
     875    fields. If for some reason you need an actual ``dict`` object, use
     876    ``Hit.fields()`` to get one.
     877   
    874878    >>> r = searcher.search(query.Term("content", "render"))
    875879    >>> r[0]
    876880    <Hit {title=u"Rendering the scene"}>
     881    >>> r[0].rank
     882    0
    877883    >>> r[0].docnum
    878884    4592L
    879885    >>> r[0].score
    880     2.52045682
     886    2.52045682
     887    >>> r[0]["title"]
     888    "Rendering the scene"
     889    >>> r[0].keys()
     890    ["title"]
    881891    """
    882892   
     
    891901       
    892902        self.searcher = searcher
    893         self.pos = pos
     903        self.pos = self.rank = pos
    894904        self.docnum = docnum
    895905        self.score = score
    896906        self._fields = None
     907   
     908    def fields(self):
     909        """Returns a dictionary of the stored fields of the document this
     910        object represents.
     911        """
     912       
     913        if self._fields is None:
     914            self._fields = self.searcher.stored_fields(self.docnum)
     915        return self._fields
    897916   
    898917    def __repr__(self):
     
    907926            return False
    908927   
    909     def __iter__(self):
    910         return self.fields().iterkeys()
    911    
    912     def __getitem__(self, key):
    913         return self.fields().__getitem__(key)
    914    
    915     def __len__(self):
    916         return len(self.fields())
    917    
    918     def fields(self):
    919         if self._fields is None:
    920             self._fields = self.searcher.stored_fields(self.docnum)
    921         return self._fields
    922    
    923     def get(self, key, default=None):
    924         return self.fields().get(key, default)
     928    def __len__(self): return len(self.fields())
     929    def __iter__(self): return self.fields().iterkeys()
     930    def __getitem__(self, key): return self.fields().__getitem__(key)
     931    def __contains__(self, key): return key in self.fields()
     932    def items(self): return self.fields().items()
     933    def keys(self): return self.fields().keys()
     934    def values(self): return self.fields().values()
     935    def iteritems(self): return self.fields().iteritems()
     936    def iterkeys(self): return self.fields().iterkeys()
     937    def itervalues(self): return self.fields().itervalues()
     938    def get(self, key, default=None): return self.fields().get(key, default)
     939   
     940    def __setitem__(self, key, value):
     941        raise NotImplementedError("You cannot modify a search result")
     942    def __delitem__(self, key, value):
     943        raise NotImplementedError("You cannot modify a search result")
     944    def clear(self):
     945        raise NotImplementedError("You cannot modify a search result")
     946    def update(self, dict=None, **kwargs):
     947        raise NotImplementedError("You cannot modify a search result")
    925948   
    926949
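With the methods added above, a Hit behaves like a read-only dictionary of the matching document's stored fields. A self-contained usage sketch against the bundled whoosh, using an in-memory index; the schema and documents are illustrative assumptions:

    from whoosh import fields, query
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(title=u"Rendering the scene", content=u"how to render a scene")
    w.commit()

    hit = ix.searcher().search(query.Term("content", u"render"))[0]
    print hit.rank, hit.docnum, hit["title"]   # dict-style access to stored fields
    print hit.keys(), hit.fields()
    # hit["title"] = u"x" would raise NotImplementedError: results are read-only
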
  • branches/mbutscher/work/lib/whoosh/support/bench.py

    r230 r234  
    1818import os.path, random, sys
    1919from optparse import OptionParser
     20from shutil import rmtree
    2021from zlib import compress, decompress
    2122
     
    3637    pass
    3738
     39try:
     40    from persistent import Persistent
     41    class ZDoc(Persistent):
     42        def __init__(self, d):
     43            self.__dict__.update(d)
     44except ImportError:
     45    pass
     46
     47
     48class Module(object):
     49    def __init__(self, bench, options, args):
     50        self.bench = bench
     51        self.options = options
     52        self.args = args
     53   
     54    def __repr__(self):
     55        return self.__class__.__name__
     56   
     57    def indexer(self):
     58        pass
     59   
     60    def index_document(self, d):
     61        raise NotImplementedError
     62   
     63    def finish(self):
     64        pass
     65   
     66    def searcher(self):
     67        pass
     68   
     69    def query(self):
     70        raise NotImplementedError
     71   
     72    def find(self, q):
     73        raise NotImplementedError
     74   
     75    def findterms(self, terms):
     76        raise NotImplementedError
     77   
     78    def results(self, r):
     79        return r
     80
     81
     82class Spec(object):
     83    headline_field = "title"
     84    main_field = "body"
     85    whoosh_compress_main = False
     86   
     87    def __init__(self, options, args):
     88        self.options = options
     89        self.args = args
     90       
     91    def documents(self):
     92        raise NotImplementedError
     93   
     94    def setup(self):
     95        pass
     96   
     97    def print_results(self, ls):
     98        showbody = self.options.showbody
     99        limit = self.options.limit
     100        for i, hit in enumerate(ls):
     101            if i >= limit:
     102                break
     103           
     104            print "%d. %s" % (i+1, hit.get(self.headline_field))
     105            if showbody:
     106                print hit.get(self.main_field)
     107           
     108class WhooshModule(Module):
     109    def indexer(self):
     110        schema = self.bench.spec.whoosh_schema()
     111        path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)
     112        if not os.path.exists(path):
     113            os.mkdir(path)
     114        ix = index.create_in(path, schema)
     115        self.writer = ix.writer(procs=int(self.options.procs),
     116                                limitmb=int(self.options.limitmb))
     117
     118    def index_document(self, d):
     119        if hasattr(self.bench, "process_document_whoosh"):
     120            self.bench.process_document_whoosh(d)
     121        if self.bench.spec.whoosh_compress_main:
     122            mf = self.bench.spec.main_field
     123            d["_stored_%s" % mf] = compress(d[mf], 9)
     124        self.writer.add_document(**d)
     125
     126    def finish(self):
     127        self.writer.commit()
     128       
     129    def searcher(self):
     130        path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)
     131        ix = index.open_dir(path)
     132        self.srch = ix.searcher()
     133        self.parser = qparser.QueryParser(self.bench.spec.main_field, schema=ix.schema)
     134       
     135    def query(self):
     136        qstring = " ".join(self.args).decode("utf8")
     137        return self.parser.parse(qstring)
     138   
     139    def find(self, q):
     140        return self.srch.search(q, limit=int(self.options.limit))
     141   
     142    def results(self, r):
     143        mf = self.bench.spec.main_field
     144        for hit in r:
     145            fs = hit.fields()
     146            if self.bench.spec.whoosh_compress_main:
     147                fs[mf] = decompress(fs[mf])
     148            yield fs
     149   
     150    def findterms(self, terms):
     151        limit = int(self.options.limit)
     152        s = self.srch
     153        q = query.Term(self.main_field, None)
     154        for term in terms:
     155            q.text = term
     156            yield s.search(q, limit=limit)
     157   
     158
     159class XappyModule(Module):
     160    def indexer(self):
     161        path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)
     162        conn = self.bench.spec.xappy_connection(path)
     163        return conn
     164   
     165    def index_document(self, conn, d):
     166        if hasattr(self.bench, "process_document_xappy"):
     167            self.bench.process_document_xappy(d)
     168        doc = xappy.UnprocessedDocument()
     169        for key, values in d:
     170            if not isinstance(values, list):
     171                values = [values]
     172            for value in values:
     173                doc.fields.append(xappy.Field(key, value))
     174        conn.add(doc)
     175
     176    def finish(self, conn):
     177        conn.flush()
     178       
     179    def searcher(self):
     180        path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)
     181        return xappy.SearchConnection(path)
     182       
     183    def query(self, conn):
     184        return conn.query_parse(" ".join(self.args))
     185   
     186    def find(self, conn, q):
     187        return conn.search(q, 0, int(self.options.limit))
     188   
     189    def findterms(self, conn, terms):
     190        limit = int(self.options.limit)
     191        for term in terms:
     192            q = conn.query_field(self.main_field, term)
     193            yield conn.search(q, 0, limit)
     194   
     195    def results(self, r):
     196        hf = self.bench.spec.headline_field
     197        mf = self.bench.spec.main_field
     198        for hit in r:
     199            yield {hf: hit.data[hf], mf: hit.data[mf]}
     200       
     201
     202class XapianModule(Module):
     203    def indexer(self):
     204        path = os.path.join(self.options.dir, "%s_xapian" % self.options.indexname)
     205        self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN)
     206        self.ixer = xapian.TermGenerator()
     207       
     208    def index_document(self, d):
     209        if hasattr(self.bench, "process_document_xapian"):
     210            self.bench.process_document_xapian(d)
     211        doc = xapian.Document()
     212        doc.add_value(0, d.get(self.bench.spec.headline_field, "-"))
     213        doc.set_data(d[self.main_field])
     214        self.ixer.set_document(doc)
     215        self.ixer.index_text(d[self.main_field])
     216        self.database.add_document(doc)
     217       
     218    def finish(self):
     219        self.database.flush()
     220       
     221    def searcher(self):
     222        path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)
     223        self.db = xapian.Database(path)
     224        self.enq = xapian.Enquire(self.db)
     225        self.qp = xapian.QueryParser()
     226        self.qp.set_database(self.db)
     227       
     228    def query(self):
     229        return self.qp.parse_query(" ".join(self.args))
     230   
     231    def find(self, q):
     232        self.enq.set_query(q)
     233        return self.enq.get_mset(0, int(self.options.limit))
     234   
     235    def findterms(self, terms):
     236        limit = int(self.options.limit)
     237        for term in terms:
     238            q = self.qp.parse_query(term)
     239            self.enq.set_query(q)
     240            yield self.enq.get_mset(0, limit)
     241   
     242    def results(self, matches):
     243        hf = self.bench.spec.headline_field
     244        mf = self.bench.spec.main_field
     245        for m in matches:
     246            yield {hf: m.document.get_value(0), mf: m.document.get_data()}
     247
     248
     249class SolrModule(Module):
     250    def indexer(self):
     251        self.solr_doclist = []
     252        self.conn = pysolr.Solr(self.options.url)
     253        self.conn.delete("*:*")
     254        self.conn.commit()
     255   
     256    def index_document(self, d):
     257        self.solr_doclist.append(d)
     258        if len(self.solr_doclist) >= int(self.options.batch):
     259            self.conn.add(self.solr_doclist, commit=False)
     260            self.solr_doclist = []
     261       
     262    def finish(self):
     263        if self.solr_doclist:
     264            self.conn.add(self.solr_doclist)
     265        del self.solr_doclist
     266        self.conn.optimize(block=True)
     267       
     268    def searcher(self):
     269        self.solr = pysolr.Solr(self.options.url)
     270   
     271    def query(self):
     272        return " ".join(self.args)
     273   
     274    def find(self, q):
     275        return self.solr.search(q, limit=int(self.options.limit))
     276   
     277    def findterms(self, terms):
     278        limit = int(self.options.limit)
     279        for term in terms:
     280            yield self.solr.search("body:" + term, limit=limit)
     281   
     282
     283class ZcatalogModule(Module):
     284    def indexer(self):
     285        from ZODB.FileStorage import FileStorage
     286        from ZODB.DB import DB
     287        from zcatalog import catalog
     288        from zcatalog import indexes
     289        import transaction
     290       
     291        dir = os.path.join(self.options.dir, "%s_zcatalog" % self.options.indexname)
     292        if os.path.exists(dir):
     293            rmtree(dir)
     294        os.mkdir(dir)
     295       
     296        storage = FileStorage(os.path.join(dir, "index"))
     297        db = DB(storage)
     298        conn = db.open()
     299       
     300        self.cat = catalog.Catalog()
     301        self.bench.spec.zcatalog_setup(self.cat)
     302        conn.root()["cat"] = self.cat
     303        transaction.commit()
     304       
     305        self.zcatalog_count = 0
     306   
     307    def index_document(self, d):
     308        if hasattr(self.bench, "process_document_zcatalog"):
     309            self.bench.process_document_zcatalog(d)
     310        doc = ZDoc(d)
     311        self.cat.index_doc(doc)
     312        self.zcatalog_count += 1
     313        if self.zcatalog_count >= 100:
     314            import transaction
     315            transaction.commit()
     316            self.zcatalog_count = 0
     317       
     318    def finish(self):
     319        import transaction
     320        transaction.commit()
     321        del self.zcatalog_count
     322       
     323    def searcher(self):
     324        from ZODB.FileStorage import FileStorage
     325        from ZODB.DB import DB
     326        from zcatalog import catalog
     327        from zcatalog import indexes
     328        import transaction
     329       
     330        path = os.path.join(self.options.dir, "%s_zcatalog" % self.options.indexname, "index")
     331        storage = FileStorage(path)
     332        db = DB(storage)
     333        conn = db.open()
     334       
     335        self.cat = conn.root()["cat"]
     336   
     337    def query(self):
     338        return " ".join(self.args)
     339   
     340    def find(self, q):
     341        return self.cat.searchResults(body=q)
     342   
     343    def findterms(self, terms):
     344        for term in terms:
     345            yield self.cat.searchResults(body=term)
     346   
     347    def results(self, r):
     348        hf = self.bench.spec.headline_field
     349        mf = self.bench.spec.main_field
     350        for hit in r:
     351            # Have to access the attributes for them to be retrieved
     352            yield {hf: getattr(hit, hf), mf: getattr(hit, mf)}
    38353
    39354
    40355class Bench(object):
    41     solr_url = "http://localhost:8983/solr"
    42     main_field = "text"
    43     headline_field = "title"
    44    
    45     libs = ("whoosh", "xappy", "xapian", "solr")
    46    
    47     _name = "unknown"
    48    
    49     def name(self):
    50         return self._name
    51    
    52     def process_document_whoosh(self, d):
    53         pass
    54    
    55     def process_document_xappy(self, d):
    56         pass
    57    
    58     def process_document_xapian(self, d):
    59         pass
    60    
    61     def process_document_solr(self, d):
    62         pass
     356    libs = {"whoosh": WhooshModule, "xappy": XappyModule,
     357            "xapian": XapianModule, "solr": SolrModule,
     358            "zcatalog": ZcatalogModule}
    63359   
    64360    def index(self, lib):
     
    73369       
    74370        starttime = chunkstarttime = now()
    75         ix = getattr(self, "%s_indexer" % lib)()
    76         index_document = getattr(self, "index_document_%s" % lib)
    77         for d in self.documents():
     371        lib.indexer()
     372        for d in self.spec.documents():
    78373            skipc -= 1
    79374            if not skipc:
    80                 index_document(ix, d)
     375                lib.index_document(d)
    81376                count += 1
    82377                skipc = skip
     
    91386        spooltime = now()
    92387        print "Spool time:", spooltime - starttime
    93         getattr(self, "finish_%s" % lib)(ix)
     388        lib.finish()
    94389        committime = now()
    95390        print "Commit time:", committime - spooltime
    96391        print "Total time to index", count, "documents:",  committime - starttime
    97392   
    98     def whoosh_indexer(self):
    99         schema = self.whoosh_schema()
    100         path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)
    101         if not os.path.exists(path):
    102             os.mkdir(path)
    103         ix = index.create_in(path, schema)
    104         w = ix.writer(procs=int(self.options.procs),
    105                       limitmb=int(self.options.limitmb))
    106         return w
    107    
    108     def index_document_whoosh(self, writer, d):
    109         self.process_document_whoosh(d)
    110         writer.add_document(**d)
    111        
    112     def finish_whoosh(self, writer):
    113         writer.commit()
    114        
    115     def xappy_indexer(self):
    116         path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)
    117         conn = self.xappy_connection(path)
    118         return conn
    119    
    120     def index_document_xappy(self, conn, d):
    121         self.process_document_xappy(d)
    122         doc = xappy.UnprocessedDocument()
    123         for key, values in d:
    124             if not isinstance(values, list):
    125                 values = [values]
    126             for value in values:
    127                 doc.fields.append(xappy.Field(key, value))
    128         conn.add(doc)
    129        
    130     def finish_xappy(self, conn):
    131         conn.flush()
    132                    
    133     def xapian_indexer(self):
    134         path = os.path.join(self.options.dir, "%s_xapian" % self.options.indexname)
    135         database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN)
    136         indexer = xapian.TermGenerator()
    137        
    138         return (database, indexer)
    139    
    140     def index_document_xapian(self, dix, d):
    141         self.process_document_xapian(d)
    142         database, indexer = dix
    143         doc = xapian.Document()
    144         doc.add_value(0, d.get(self.headline_field, "-"))
    145         doc.set_data(d[self.main_field])
    146         indexer.set_document(doc)
    147         indexer.index_text(d[self.main_field])
    148         database.add_document(doc)
    149        
    150     def finish_xapian(self, dix):
    151         dix[0].flush()
    152        
    153     def solr_indexer(self):
    154         self.solr_doclist = []
    155         conn = pysolr.Solr(self.options.url)
    156         conn.delete("*:*")
    157         conn.commit()
    158         return conn
    159    
    160     def index_document_solr(self, conn, d):
    161         self.solr_doclist.append(d)
    162         if len(self.solr_doclist) >= int(self.options.batch):
    163             conn.add(self.solr_doclist, commit=False)
    164             self.solr_doclist = []
    165        
    166     def finish_solr(self, conn):
    167         if self.solr_doclist:
    168             conn.add(self.solr_doclist)
    169         del self.solr_doclist
    170         conn.optimize(block=True)
    171    
    172     def whoosh_searcher(self):
    173         path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)
    174         ix = index.open_dir(path)
    175         searcher = ix.searcher()
    176         parser = qparser.QueryParser(self.main_field, schema=ix.schema)
    177        
    178         return (searcher, parser)
    179    
    180     def whoosh_query(self, s):
    181         qstring = " ".join(self.args).decode("utf8")
    182         return s[1].parse(qstring)
    183    
    184     def whoosh_find(self, s, q):
    185         return s[0].search(q, limit=int(self.options.limit))
    186    
    187     def whoosh_findterms(self, s, terms):
    188         limit = int(self.options.limit)
    189         searcher = s[0]
    190         q = query.Term(self.main_field, None)
    191         for term in terms:
    192             q.text = term
    193             yield searcher.search(q, limit=limit)
    194    
    195     def whoosh_results(self, s, r):
    196         showbody = self.options.showbody
    197        
    198         print "Runtime:", r.runtime
    199         for hit in r:
    200             print hit.get(self.headline_field)
    201             if showbody:
    202                 print decompress(hit[self.main_field])
    203    
    204     def xappy_searcher(self):
    205         path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)
    206         return xappy.SearchConnection(path)
    207        
    208     def xappy_query(self, conn):
    209         return conn.query_parse(" ".join(self.args))
    210    
    211     def xappy_find(self, conn, q):
    212         return conn.search(q, 0, int(self.options.limit))
    213    
    214     def xappy_findterms(self, conn, terms):
    215         limit = int(self.options.limit)
    216         for term in terms:
    217             q = conn.query_field(self.main_field, term)
    218             yield conn.search(q, 0, limit)
    219    
    220     def xappy_results(self, conn, r):
    221         showbody = self.options.showbody
    222         for hit in r:
    223             print hit.rank, hit.data[self.headline_field]
    224             if showbody:
    225                 print hit.data[self.main_field]
    226    
    227     def xapian_searcher(self):
    228         path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)
    229         db = xapian.Database(path)
    230         enq = xapian.Enquire(db)
    231         qp = xapian.QueryParser()
    232         qp.set_database(db)
    233         return db, enq, qp
    234    
    235     def xapian_query(self, s):
    236         return s[2].parse_query(" ".join(self.args))
    237    
    238     def xapian_find(self, s, q):
    239         enq = s[1]
    240         enq.set_query(q)
    241         return enq.get_mset(0, int(self.options.limit))
    242    
    243     def xapian_findterms(self, s, terms):
    244         limit = int(self.options.limit)
    245         db, enq, qp = s
    246         for term in terms:
    247             q = qp.parse_query(term)
    248             enq.set_query(q)
    249             yield enq.get_mset(0, limit)
    250    
    251     def xapian_results(self, s, matches):
    252         showbody = self.options.showbody
    253         for m in matches:
    254             print m.rank, repr(m.document.get_value(0))
    255             if showbody:
    256                 print m.document.get_data()
    257    
    258     def solr_searcher(self):
    259         return pysolr.Solr(self.solr_url)
    260    
    261     def solr_query(self, solr):
    262         return " ".join(self.args)
    263    
    264     def solr_find(self, solr, q):
    265         return solr.search(q, limit=int(self.options.limit))
    266    
    267     def solr_findterms(self, solr, terms):
    268         limit = int(self.options.limit)
    269         for term in terms:
    270             yield solr.search("body:" + term, limit=limit)
    271    
    272     def solr_results(self, solr, r):
    273         showbody = self.options.showbody
    274         print len(r), "results"
    275         for hit in r:
    276             print hit.get(self.headline_field)
    277             if showbody:
    278                 print hit[self.main_field]
    279    
    280393    def search(self, lib):
    281         s = getattr(self, "%s_searcher" % lib)()
     394        lib.searcher()
     395       
    282396        t = now()
    283         q = getattr(self, "%s_query" % lib)(s)
     397        q = lib.query()
    284398        print "Query:", q
    285         r = getattr(self, "%s_find" % lib)(s, q)
     399        r = lib.find(q)
    286400        print "Search time:", now() - t
    287401       
    288402        t = now()
    289         getattr(self, "%s_results" % lib)(s, r)
     403        self.spec.print_results(lib.results(r))
    290404        print "Print time:", now() - t
    291405   
     
    296410       
    297411        print "Searching %d terms with %s" % (len(terms), lib)
    298         s = getattr(self, "%s_searcher" % lib)()
     412        lib.searcher()
    299413        starttime = now()
    300         for r in getattr(self, "%s_findterms" % lib)(s, terms):
     414        for r in lib.findterms(terms):
    301415            pass
    302416        searchtime = now() - starttime
    303417        print "Search time:", searchtime, "searches/s:", float(len(terms))/searchtime
    304418   
    305     def generate_search_file(self, lib):
    306         if self.args:
    307             f = open(self.args[0], "wb")
    308         else:
    309             f = sys.stdout
    310         count = int(self.options.generate)
    311        
    312         t = now()
    313         s = self.whoosh_searcher()[0]
    314         terms = list(s.lexicon(self.main_field))
    315         sample = random.sample(terms, count)
    316         for term in sample:
    317             if term.isalnum():
    318                 f.write(term + "\n")
    319         print now() - t
    320    
    321     def _parser(self):
     419    def _parser(self, name):
    322420        p = OptionParser()
    323421        p.add_option("-x", "--lib", dest="lib",
     
    331429                     help="Index the documents.", default=False)
    332430        p.add_option("-n", "--name", dest="indexname", metavar="PREFIX",
    333                      help="Index name prefix.", default="%s_index" % self.name())
     431                     help="Index name prefix.", default="%s_index" % name)
    334432        p.add_option("-U", "--url", dest="url", metavar="URL",
    335433                     help="Solr URL", default="http://localhost:8983/solr")
     
    363461        return p
    364462   
    365     def run(self):
    366         parser = self._parser()
     463    def run(self, specclass):
     464        parser = self._parser(specclass.name)
    367465        options, args = parser.parse_args()
    368466        self.options = options
    369467        self.args = args
    370468       
    371         lib = options.lib
    372         if lib not in self.libs:
    373             raise Exception("Unknown library: %r" % lib)
     469        if options.lib not in self.libs:
     470            raise Exception("Unknown library: %r" % options.lib)
     471        lib = self.libs[options.lib](self, options, args)
     472       
     473        self.spec = specclass(options, args)
    374474       
    375475        if options.setup:
    376             self.setup()
     476            self.spec.setup()
    377477       
    378478        action = self.search
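After this refactoring, the per-backend code lives in the Module subclasses and the per-corpus code in a Spec subclass handed to Bench.run(). A hedged sketch of a tiny spec; the TinySpec name, schema, and documents are invented for illustration:

    from whoosh import fields
    from whoosh.support.bench import Bench, Spec

    class TinySpec(Spec):
        name = "tiny"
        headline_field = "title"
        main_field = "body"

        def whoosh_schema(self):
            return fields.Schema(title=fields.TEXT(stored=True),
                                 body=fields.TEXT(stored=True))

        def documents(self):
            yield {"title": u"First doc", "body": u"hello world"}
            yield {"title": u"Second doc", "body": u"hello again"}

    if __name__ == "__main__":
        Bench().run(TinySpec)   # e.g. python tinyspec.py -x whoosh -i
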
  • branches/mbutscher/work/lib/whoosh/support/unicode.py

    r230 r234  
    249249    return _names[i]
    250250
     251
    251252def blocknum(ch):
    252253    """Returns the unicode block number for ch, or None if ch has no block.
     
    268269
    269270
    270 if __name__ == "__main__":
    271     pass
    272    
    273    
    274    
    275    
    276    
     271   
     272   
     273   
     274   
     275   
  • branches/stable-2.0/extensions/GnuplotClBridge.py

    r231 r234  
    117117#             childIn, childOut, childErr = os.popen3(cmdline, "b")
    118118            popenObject = subprocess.Popen(cmdline, shell=True,
    119                     stderr=subprocess.PIPE)
     119                    stderr=subprocess.PIPE, stdout=subprocess.PIPE,
     120                    stdin=subprocess.PIPE)
    120121            childErr = popenObject.stderr
    121122           
     123            # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     124            # why this is necessary
     125            popenObject.stdin.close()
     126            popenObject.stdout.close()
     127
    122128            if u"noerror" in [a.strip() for a in insToken.appendices]:
    123129                childErr.read()
  • branches/stable-2.0/extensions/GraphvizClBridge.py

    r231 r234  
    124124#             childIn, childOut, childErr = os.popen3(cmdline, "b")
    125125            popenObject = subprocess.Popen(cmdline, shell=True,
    126                     stderr=subprocess.PIPE)
     126                    stderr=subprocess.PIPE, stdout=subprocess.PIPE,
     127                    stdin=subprocess.PIPE)
    127128            childErr = popenObject.stderr
     129
     130            # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     131            # why this is necessary
     132            popenObject.stdin.close()
     133            popenObject.stdout.close()
    128134
    129135            if u"noerror" in [a.strip() for a in insToken.appendices]:
  • branches/stable-2.0/extensions/GraphvizStructureView.py

    r231 r234  
    342342#             childIn, childOut, childErr = os.popen3(cmdline, "b")
    343343            popenObject = subprocess.Popen(cmdline, shell=True,
    344                     stderr=subprocess.PIPE)
     344                    stderr=subprocess.PIPE, stdout=subprocess.PIPE,
     345                    stdin=subprocess.PIPE)
    345346            childErr = popenObject.stderr
     347
     348            # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     349            # why this is necessary
     350            popenObject.stdin.close()
     351            popenObject.stdout.close()
    346352
    347353            if u"noerror" in [a.strip() for a in insParams]:
  • branches/stable-2.0/extensions/MimeTexCGIBridge.py

    r231 r234  
    103103        # Run MimeTeX process
    104104        popenObject = subprocess.Popen(cmdline, shell=True,
    105                 stdout=subprocess.PIPE)
     105                 stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     106                 stderr=subprocess.PIPE)
     107
    106108        childOut = popenObject.stdout
     109       
     110        # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     111        # why this is necessary
     112        popenObject.stdin.close()
     113        popenObject.stderr.close()
    107114
    108115        # Read stdout of process entirely
  • branches/stable-2.0/extensions/PloticusClBridge.py

    r231 r234  
    128128#             childIn, childOut, childErr = os.popen3(cmdline, "b")
    129129            popenObject = subprocess.Popen(cmdline, shell=True,
    130                     stderr=subprocess.PIPE)
     130                    stderr=subprocess.PIPE, stdout=subprocess.PIPE,
     131                    stdin=subprocess.PIPE)
    131132            childErr = popenObject.stderr
    132            
     133
     134            # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error
     135            # why this is necessary
     136            popenObject.stdin.close()
     137            popenObject.stdout.close()
     138
    133139            if u"noerror" in [a.strip() for a in insToken.appendices]:
    134140                childErr.read()
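All of the command line bridge plugins now apply the same workaround: pipe all three standard streams and immediately close the ones that are not read, which avoids the Windows "invalid handle" error described in the linked thread. A stand-alone sketch of the pattern; the "dot -V" command line is just an example:

    import subprocess

    popenObject = subprocess.Popen("dot -V", shell=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

    # Close the unused handles right away, then read stderr as before
    popenObject.stdin.close()
    popenObject.stdout.close()
    errtext = popenObject.stderr.read()
    popenObject.stderr.close()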