Changeset 234
- Timestamp:
- 01/09/11 19:37:51 (2 years ago)
- Location:
- branches
- Files:
-
- 4 added
- 33 modified
-
mbutscher/work/extensions/GnuplotClBridge.py (modified) (1 diff)
-
mbutscher/work/extensions/GraphvizClBridge.py (modified) (1 diff)
-
mbutscher/work/extensions/GraphvizStructureView.py (modified) (1 diff)
-
mbutscher/work/extensions/MimeTexCGIBridge.py (modified) (1 diff)
-
mbutscher/work/extensions/PloticusClBridge.py (modified) (1 diff)
-
mbutscher/work/lib/pwiki/DocPages.py (modified) (3 diffs)
-
mbutscher/work/lib/pwiki/PersonalWikiFrame.py (modified) (1 diff)
-
mbutscher/work/lib/pwiki/SearchAndReplace.py (modified) (3 diffs)
-
mbutscher/work/lib/pwiki/SearchAndReplaceDialogs.py (modified) (6 diffs)
-
mbutscher/work/lib/pwiki/wikidata/WikiDataManager.py (modified) (6 diffs)
-
mbutscher/work/lib/pwiki/wikidata/compact_sqlite/DbStructure.py (modified) (2 diffs)
-
mbutscher/work/lib/pwiki/wikidata/original_gadfly/DbStructure.py (modified) (2 diffs)
-
mbutscher/work/lib/pwiki/wikidata/original_sqlite/DbStructure.py (modified) (2 diffs)
-
mbutscher/work/lib/whoosh/analysis.py (modified) (22 diffs)
-
mbutscher/work/lib/whoosh/fields.py (modified) (2 diffs)
-
mbutscher/work/lib/whoosh/filedb/fileindex.py (modified) (2 diffs)
-
mbutscher/work/lib/whoosh/filedb/filepostings.py (modified) (1 diff)
-
mbutscher/work/lib/whoosh/formats.py (modified) (1 diff)
-
mbutscher/work/lib/whoosh/highlight.py (modified) (7 diffs)
-
mbutscher/work/lib/whoosh/lang/phonetic.py (added)
-
mbutscher/work/lib/whoosh/lang/porter2.py (modified) (4 diffs)
-
mbutscher/work/lib/whoosh/matching.py (modified) (4 diffs)
-
mbutscher/work/lib/whoosh/qparser/common.py (added)
-
mbutscher/work/lib/whoosh/qparser/default.py (modified) (4 diffs)
-
mbutscher/work/lib/whoosh/qparser/plugins.py (added)
-
mbutscher/work/lib/whoosh/qparser/syntax.py (added)
-
mbutscher/work/lib/whoosh/query.py (modified) (13 diffs)
-
mbutscher/work/lib/whoosh/reading.py (modified) (1 diff)
-
mbutscher/work/lib/whoosh/scoring.py (modified) (1 diff)
-
mbutscher/work/lib/whoosh/searching.py (modified) (3 diffs)
-
mbutscher/work/lib/whoosh/support/bench.py (modified) (7 diffs)
-
mbutscher/work/lib/whoosh/support/unicode.py (modified) (2 diffs)
-
stable-2.0/extensions/GnuplotClBridge.py (modified) (1 diff)
-
stable-2.0/extensions/GraphvizClBridge.py (modified) (1 diff)
-
stable-2.0/extensions/GraphvizStructureView.py (modified) (1 diff)
-
stable-2.0/extensions/MimeTexCGIBridge.py (modified) (1 diff)
-
stable-2.0/extensions/PloticusClBridge.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
branches/mbutscher/work/extensions/GnuplotClBridge.py
r231 r234 117 117 # childIn, childOut, childErr = os.popen3(cmdline, "b") 118 118 popenObject = subprocess.Popen(cmdline, shell=True, 119 stderr=subprocess.PIPE) 119 stderr=subprocess.PIPE, stdout=subprocess.PIPE, 120 stdin=subprocess.PIPE) 120 121 childErr = popenObject.stderr 121 122 123 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 124 # why this is necessary 125 popenObject.stdin.close() 126 popenObject.stdout.close() 127 122 128 if u"noerror" in [a.strip() for a in insToken.appendices]: 123 129 childErr.read() -
branches/mbutscher/work/extensions/GraphvizClBridge.py
r231 r234 124 124 # childIn, childOut, childErr = os.popen3(cmdline, "b") 125 125 popenObject = subprocess.Popen(cmdline, shell=True, 126 stderr=subprocess.PIPE) 126 stderr=subprocess.PIPE, stdout=subprocess.PIPE, 127 stdin=subprocess.PIPE) 127 128 childErr = popenObject.stderr 129 130 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 131 # why this is necessary 132 popenObject.stdin.close() 133 popenObject.stdout.close() 128 134 129 135 if u"noerror" in [a.strip() for a in insToken.appendices]: -
branches/mbutscher/work/extensions/GraphvizStructureView.py
r231 r234 342 342 # childIn, childOut, childErr = os.popen3(cmdline, "b") 343 343 popenObject = subprocess.Popen(cmdline, shell=True, 344 stderr=subprocess.PIPE) 344 stderr=subprocess.PIPE, stdout=subprocess.PIPE, 345 stdin=subprocess.PIPE) 345 346 childErr = popenObject.stderr 347 348 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 349 # why this is necessary 350 popenObject.stdin.close() 351 popenObject.stdout.close() 346 352 347 353 if u"noerror" in [a.strip() for a in insParams]: -
branches/mbutscher/work/extensions/MimeTexCGIBridge.py
r231 r234 103 103 # Run MimeTeX process 104 104 popenObject = subprocess.Popen(cmdline, shell=True, 105 stdout=subprocess.PIPE) 105 stdout=subprocess.PIPE, stdin=subprocess.PIPE, 106 stderr=subprocess.PIPE) 107 106 108 childOut = popenObject.stdout 109 110 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 111 # why this is necessary 112 popenObject.stdin.close() 113 popenObject.stderr.close() 107 114 108 115 # Read stdout of process entirely -
branches/mbutscher/work/extensions/PloticusClBridge.py
r231 r234 128 128 # childIn, childOut, childErr = os.popen3(cmdline, "b") 129 129 popenObject = subprocess.Popen(cmdline, shell=True, 130 stderr=subprocess.PIPE) 130 stderr=subprocess.PIPE, stdout=subprocess.PIPE, 131 stdin=subprocess.PIPE) 131 132 childErr = popenObject.stderr 132 133 134 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 135 # why this is necessary 136 popenObject.stdin.close() 137 popenObject.stdout.close() 138 133 139 if u"noerror" in [a.strip() for a in insToken.appendices]: 134 140 childErr.read() -
branches/mbutscher/work/lib/pwiki/DocPages.py
r230 r234 1 1 from __future__ import with_statement 2 3 # import profilehooks 4 # profile = profilehooks.profile(filename="profile.prf", immediate=False) 2 ## import profilehooks 3 ## profile = profilehooks.profile(filename="profile.prf", immediate=False) 5 4 6 5 … … 27 26 28 27 import Serialization 29 # from Serialization import SerializeStream30 28 31 29 … … 1736 1734 return valid 1737 1735 1738 1736 1739 1737 def putIntoSearchIndex(self, threadstop=DUMBTHREADSTOP): 1740 1738 """ 1741 Add or update the reverseindex for the given docPage1739 Add or update the index for the given docPage 1742 1740 """ 1743 1741 with self.textOperationLock: -
branches/mbutscher/work/lib/pwiki/PersonalWikiFrame.py
r231 r234 2 2 from __future__ import with_statement 3 3 4 ## import hotshot5 ## _prof = hotshot.Profile("hotshot.prf")6 4 ## import profilehooks 7 5 ## profile = profilehooks.profile(filename="profile.prf", immediate=False) -
branches/mbutscher/work/lib/pwiki/SearchAndReplace.py
r230 r234 1697 1697 """ 1698 1698 if self.searchStr == u"": 1699 self.searchOpTree = AllWikiPagesNode(self) 1700 return 1701 1702 if self.indexSearch != "no": 1703 # Search tree not used, but non-None value needed 1699 1704 self.searchOpTree = AllWikiPagesNode(self) 1700 1705 return … … 1808 1813 1809 1814 1815 def getWhooshIndexQuery(self, wikiDocument): 1816 from whoosh.qparser import QueryParser 1817 1818 qp = QueryParser("content", schema=wikiDocument.getWhooshIndexSchema()) 1819 q = qp.parse(self.searchStr) 1820 # print "--getWhooshIndexQuery10", repr((qp, q)) 1821 1822 return q 1823 1824 1825 def hasWhooshHighlighting(self): 1826 """ 1827 Return True iff call to highlightWhooshIndexFound() would work. 1828 """ 1829 return self.indexSearch == "default" 1830 1831 1832 def highlightWhooshIndexFound(self, content, docPage, before, after, 1833 formatter=None): 1834 """ 1835 Retrieve formatted output with highlighted search hits for a page. 1836 formatter -- whoosh formatter or None (uses SimpleHtmlFormatter then) 1837 """ 1838 if docPage is None: 1839 return 1840 1841 from whoosh import highlight 1842 1843 # TODO: Loop invariant, move out? 
1844 q = self.getWhooshIndexQuery(docPage.getWikiDocument()) 1845 1846 # Extract the terms the user mentioned 1847 terms = [text for fieldname, text in q.all_terms() 1848 if fieldname == "content"] 1849 1850 analyzer = docPage.getWikiDocument().getWhooshIndexContentAnalyzer() 1851 1852 # TODO: Length of before and after from config 1853 fragmenter = highlight.ContextFragmenter(terms, (before + after) * 2, 1854 before, after) 1855 1856 if formatter is None: 1857 formatter = highlight.SimpleHtmlFormatter() 1858 1859 return highlight.highlight(content, terms, analyzer, 1860 fragmenter, formatter, top=1) 1861 1862 1810 1863 def hasParticularTextPosition(self): 1811 1864 if self.indexSearch != "no": … … 1855 1908 if self.searchOpTree is None: 1856 1909 self.rebuildSearchOpTree() 1857 1910 1858 1911 if commonCache is None: 1859 1912 commonCache = {} 1860 1913 1861 1914 self.listWikiPagesOp.beginWikiSearch(wikiDocument, 1862 1915 commonCache=commonCache) 1863 1916 1864 1917 return self.searchOpTree.beginWikiSearch(wikiDocument, 1865 1918 commonCache=commonCache) -
branches/mbutscher/work/lib/pwiki/SearchAndReplaceDialogs.py
r230 r234 1 # # import hotshot2 # # _prof = hotshot.Profile("hotshot.prf")1 # import profilehooks 2 # profile = profilehooks.profile(filename="profile.prf", immediate=False) 3 3 4 4 import sys, traceback, re, threading, time … … 80 80 self.occNumber = occNumber 81 81 return self 82 83 84 def setHtmlDirectly(self, occHtml): 85 self.occNumber = -1 86 self.occCount = -1 87 self.occHtml = occHtml 88 82 89 83 90 … … 220 227 context = before + after 221 228 222 if notsarOp.hasParticularTextPosition():223 # No specific position to show as context, so show beginning of page224 # Also, no occurrence counting possible225 if context == 0:229 if sarOp.hasParticularTextPosition(): 230 if context == 0 and not countOccurrences: 231 # No context, no occurrence counting 232 # -> just a list of found pages 226 233 self.foundinfo = [_SearchResultItemInfo(w) for w in found] 227 234 else: 228 for w in found: 229 text = wikiDocument.getWikiPageNoError(w).\ 230 getLiveTextNoTemplate() 231 if text is None: 232 continue 233 self.foundinfo.append( 234 _SearchResultItemInfo(w).buildOccurrence( 235 text, before, after, (-1, -1), -1, 100)) 236 threadstop.testRunning() 237 else: 238 if context == 0 and not countOccurrences: 239 # No context, no occurrence counting 240 self.foundinfo = [_SearchResultItemInfo(w) for w in found] 241 else: 235 # "As is" or regex search 242 236 sarOp.beginWikiSearch(self.pWiki.getWikiDocument()) 243 237 try: … … 270 264 info = _SearchResultItemInfo(w, occPos=pos, 271 265 maxOccCount=maxCountOccurrences) 272 266 273 267 if countOccurrences: 274 268 occ = 1 … … 290 284 finally: 291 285 sarOp.endWikiSearch() 292 286 elif sarOp.hasWhooshHighlighting(): 287 # Index search 288 if context == 0: 289 # No context, occurrence counting doesn't matter 290 # -> just a list of found pages 291 self.foundinfo = [_SearchResultItemInfo(w) for w in found] 292 else: 293 sarOp.beginWikiSearch(self.pWiki.getWikiDocument()) 294 try: 295 for w in found: 296 threadstop.testRunning() 297 
docPage = wikiDocument.getWikiPageNoError(w) 298 text = docPage.getLiveTextNoTemplate() 299 if text is None: 300 continue 301 302 html, firstPos = sarOp.highlightWhooshIndexFound( 303 text, docPage, before, after) 304 305 info = _SearchResultItemInfo(w, occPos=(firstPos, firstPos)) 306 info.setHtmlDirectly(html) 307 308 self.foundinfo.append(info) 309 finally: 310 sarOp.endWikiSearch() 311 else: # not sarOp.hasParticularTextPosition(): 312 # No specific position to show as context, so show beginning of page 313 # Also, no occurrence counting possible 314 if context == 0: 315 self.foundinfo = [_SearchResultItemInfo(w) for w in found] 316 else: 317 for w in found: 318 text = wikiDocument.getWikiPageNoError(w).\ 319 getLiveTextNoTemplate() 320 if text is None: 321 continue 322 self.foundinfo.append( 323 _SearchResultItemInfo(w).buildOccurrence( 324 text, before, after, (-1, -1), -1, 100)) 325 threadstop.testRunning() 326 293 327 threadstop.testRunning() 294 328 self.isShowingSearching = False … … 1093 1127 1094 1128 1129 # @profile 1095 1130 def _refreshPageList(self): 1096 1131 sarOp = self._buildSearchReplaceOperation() -
branches/mbutscher/work/lib/pwiki/wikidata/WikiDataManager.py
r231 r234 1 1 from __future__ import with_statement 2 2 3 3 4 from weakref import WeakValueDictionary … … 1291 1292 1292 1293 if self.isSearchIndexEnabled(): 1293 # Step four: update reverseindex1294 # Step four: update index 1294 1295 for wikiWord in wikiWords: 1295 1296 progresshandler.update(step, _(u"Update index of %s") % wikiWord) … … 1596 1597 return result 1597 1598 else: 1598 # Processing reverse index search 1599 from whoosh.qparser import QueryParser 1600 1599 # Processing index search 1601 1600 threadstop.testRunning() 1602 1601 if not self.isSearchIndexEnabled(): 1603 1602 return [] 1604 1603 1605 qp = QueryParser("content", schema=self._getSearchIndexSchema()) 1606 q = qp.parse(sarOp.searchStr) 1604 q = sarOp.getWhooshIndexQuery(self) 1607 1605 s = self.getSearchIndex().searcher() 1608 1606 threadstop.testRunning() 1609 resultList = s.search(q) 1607 resultList = s.search(q, limit=None) 1608 1609 # docnumList = [(rd.docnum, rd["unifName"]) for rd in resultList] 1610 # 1611 # docnumList.sort() 1612 # docpp = "\n".join(["%3i %s" % rd for rd in docnumList]) 1613 # 1614 # print "--docResults" 1615 # print docpp.encode("mbcs", "replace") 1616 1610 1617 result = [rd["unifName"][9:] for rd in resultList 1611 1618 if rd["unifName"].startswith(u"wikipage/")] … … 1614 1621 return result 1615 1622 1616 1623 1624 @staticmethod 1625 def getWhooshIndexContentAnalyzer(): 1626 from whoosh.analysis import StandardAnalyzer 1627 return StandardAnalyzer(stoplist=None) 1628 1629 1630 1617 1631 _REV_SEARCH_INDEX_SCHEMA = None 1618 1632 1619 1633 @staticmethod 1620 def _getSearchIndexSchema():1634 def getWhooshIndexSchema(): 1621 1635 if WikiDataManager._REV_SEARCH_INDEX_SCHEMA is None: 1622 1636 from whoosh.fields import Schema, ID, NUMERIC, TEXT 1623 from whoosh.analysis import StandardAnalyzer1624 1637 1625 1638 WikiDataManager._REV_SEARCH_INDEX_SCHEMA = Schema( 1626 1639 unifName=ID(stored=True, unique=True), 1627 1640 modTimestamp=NUMERIC(), content=TEXT( 1628 analyzer= 
StandardAnalyzer(stoplist=None)))1641 analyzer=WikiDataManager.getWhooshIndexContentAnalyzer())) 1629 1642 1630 1643 return WikiDataManager._REV_SEARCH_INDEX_SCHEMA 1644 1645 1631 1646 1632 1647 … … 1687 1702 1688 1703 if clear or not whoosh.index.exists_in(indexPath): 1689 schema = self. _getSearchIndexSchema()1704 schema = self.getWhooshIndexSchema() 1690 1705 whoosh.index.create_in(indexPath, schema) 1691 1706 … … 1741 1756 1742 1757 1743 1744 def putIntoSearchIndex(self, wikiPage):1745 """1746 Add or update the reverse index for the given docPage1747 """1748 if not self.isSearchIndexEnabled():1749 return1750 1751 if isinstance(wikiPage, AliasWikiPage):1752 wikiPage = WikiPage(self, wikiWord)1753 1754 content = wikiPage.getLiveText()1755 1756 try:1757 searchIdx = self.getSearchIndex()1758 writer = searchIdx.writer()1759 1760 unifName = wikiPage.getUnifiedPageName()1761 1762 writer.delete_by_term("unifName", unifName)1763 writer.add_document(unifName=unifName,1764 modTimestamp=wikiPage.getTimestamps()[0],1765 content=content)1766 except:1767 writer.cancel()1768 raise1769 1770 writer.commit()1771 1772 1773 1758 def removeFromSearchIndex(self, unifName): 1774 1759 if not self.isSearchIndexEnabled(): -
branches/mbutscher/work/lib/pwiki/wikidata/compact_sqlite/DbStructure.py
r216 r234 11 11 from os.path import exists, join 12 12 13 import Consts 13 14 from pwiki.WikiExceptions import * 14 15 from pwiki.StringOps import mbcsDec, mbcsEnc, utf8Enc, utf8Dec, applyBinCompact, \ … … 1284 1285 connwrap.execSql("insert or replace into settings(key, value) " 1285 1286 "values ('lastwritever', '"+str(VERSION_DB)+"')") 1287 1288 # Write which program version at last wrote to database 1289 connwrap.execSql("insert or replace into settings(key, value) " 1290 "values ('lastwriteprogver.branchtag', '"+Consts.VERSION_TUPLE[0]+"')") 1291 connwrap.execSql("insert or replace into settings(key, value) " 1292 "values ('lastwriteprogver.major', '"+str(Consts.VERSION_TUPLE[1])+"')") 1293 connwrap.execSql("insert or replace into settings(key, value) " 1294 "values ('lastwriteprogver.minor', '"+str(Consts.VERSION_TUPLE[2])+"')") 1295 connwrap.execSql("insert or replace into settings(key, value) " 1296 "values ('lastwriteprogver.sub', '"+str(Consts.VERSION_TUPLE[3])+"')") 1297 connwrap.execSql("insert or replace into settings(key, value) " 1298 "values ('lastwriteprogver.patch', '"+str(Consts.VERSION_TUPLE[4])+"')") 1286 1299 except sqlite.ReadOnlyDbError: 1287 1300 pass -
branches/mbutscher/work/lib/pwiki/wikidata/original_gadfly/DbStructure.py
r214 r234 12 12 import glob 13 13 14 import Consts 14 15 from pwiki.WikiExceptions import * 15 16 from pwiki.StringOps import mbcsDec, mbcsEnc, utf8Enc, utf8Dec, \ … … 1130 1131 try: 1131 1132 setSettingsValue(connwrap, "lastwritever", str(VERSION_DB)) 1133 1134 # Write which program version at last wrote to database 1135 setSettingsValue(connwrap, "lastwriteprogver.branchtag", Consts.VERSION_TUPLE[0]) 1136 setSettingsValue(connwrap, "lastwriteprogver.major", str(Consts.VERSION_TUPLE[1])) 1137 setSettingsValue(connwrap, "lastwriteprogver.minor", str(Consts.VERSION_TUPLE[2])) 1138 setSettingsValue(connwrap, "lastwriteprogver.sub", str(Consts.VERSION_TUPLE[3])) 1139 setSettingsValue(connwrap, "lastwriteprogver.patch", str(Consts.VERSION_TUPLE[4])) 1132 1140 except IOError: 1133 1141 pass -
branches/mbutscher/work/lib/pwiki/wikidata/original_sqlite/DbStructure.py
r217 r234 11 11 from os.path import exists, join 12 12 13 import Consts 13 14 from pwiki.WikiExceptions import * 14 15 from pwiki.StringOps import mbcsDec, mbcsEnc, utf8Enc, utf8Dec, applyBinCompact, \ … … 995 996 """ 996 997 try: 997 # Write which version at last wrote to database998 # Write which format version at last wrote to database 998 999 connwrap.execSql("insert or replace into settings(key, value) " 999 1000 "values ('lastwritever', '"+str(VERSION_DB)+"')") 1001 1002 # Write which program version at last wrote to database 1003 connwrap.execSql("insert or replace into settings(key, value) " 1004 "values ('lastwriteprogver.branchtag', '"+Consts.VERSION_TUPLE[0]+"')") 1005 connwrap.execSql("insert or replace into settings(key, value) " 1006 "values ('lastwriteprogver.major', '"+str(Consts.VERSION_TUPLE[1])+"')") 1007 connwrap.execSql("insert or replace into settings(key, value) " 1008 "values ('lastwriteprogver.minor', '"+str(Consts.VERSION_TUPLE[2])+"')") 1009 connwrap.execSql("insert or replace into settings(key, value) " 1010 "values ('lastwriteprogver.sub', '"+str(Consts.VERSION_TUPLE[3])+"')") 1011 connwrap.execSql("insert or replace into settings(key, value) " 1012 "values ('lastwriteprogver.patch', '"+str(Consts.VERSION_TUPLE[4])+"')") 1000 1013 except sqlite.ReadOnlyDbError: 1001 1014 pass -
branches/mbutscher/work/lib/whoosh/analysis.py
r230 r234 54 54 """ 55 55 56 import copy, re 56 57 from array import array 57 import copy, re58 from collections import deque 58 59 from itertools import chain 59 60 … … 83 84 # Token object 84 85 86 87 # Mine: 85 88 class Token(object): 89 __slots__ = ( 90 "positions", 91 "chars", 92 "stopped", 93 "boost", 94 "removestops", 95 "mode", 96 97 "startchar", 98 "endchar", 99 "text", 100 "original", 101 102 "__dict__" 103 ) 104 105 86 106 """ 87 107 Represents a "token" (usually a word) extracted from the source text being … … 113 133 114 134 def __init__(self, positions=False, chars=False, removestops=True, mode='', 115 **kwargs): 135 stopped=False, boost=1.0, startchar=None, endchar=None, 136 text=None, original=None, **kwargs): 116 137 """ 117 138 :param positions: Whether tokens should have the token position in the … … 123 144 :param mode: contains a string describing the purpose for which the 124 145 analyzer is being called, i.e. 'index' or 'query'. 125 """ 126 146 147 Do not modify the parameters after mode. They are needed for 148 copying only 149 """ 127 150 self.positions = positions 128 151 self.chars = chars 129 self.stopped = False130 self.boost = 1.0131 152 self.removestops = removestops 132 153 self.mode = mode 133 self.__dict__.update(kwargs) 134 135 def __repr__(self): 154 155 self.stopped = stopped 156 self.boost = boost 157 158 self.startchar = startchar 159 self.endchar = endchar 160 self.text = text 161 self.original = original 162 163 if kwargs: 164 self.__dict__.update(kwargs) 165 166 167 def __repr__(self): # TODO!!! 
136 168 parms = ", ".join("%s=%r" % (name, value) 137 169 for name, value in self.__dict__.iteritems()) … … 139 171 140 172 def copy(self): 141 return copy.copy(self) 173 return Token(positions=self.positions, chars=self.chars, 174 removestops=self.removestops, mode=self.mode, 175 stopped=self.stopped, boost=self.boost, startchar=self.startchar, 176 endchar=self.endchar, text=self.text, original=self.original, 177 **self.__dict__) 178 179 180 181 # mchaput: 182 # class Token(object): 183 # """ 184 # Represents a "token" (usually a word) extracted from the source text being 185 # indexed. 186 # 187 # See "Advanced analysis" in the user guide for more information. 188 # 189 # Because object instantiation in Python is slow, tokenizers should create 190 # ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes 191 # each time. 192 # 193 # This trick means that consumers of tokens (i.e. filters) must never try to 194 # hold onto the token object between loop iterations, or convert the token 195 # generator into a list. Instead, save the attributes between iterations, 196 # not the object:: 197 # 198 # def RemoveDuplicatesFilter(self, stream): 199 # # Removes duplicate words. 200 # lasttext = None 201 # for token in stream: 202 # # Only yield the token if its text doesn't 203 # # match the previous token. 204 # if lasttext != token.text: 205 # yield token 206 # lasttext = token.text 207 # 208 # ...or, call token.copy() to get a copy of the token object. 209 # """ 210 # 211 # def __init__(self, positions=False, chars=False, removestops=True, mode='', 212 # **kwargs): 213 # """ 214 # :param positions: Whether tokens should have the token position in the 215 # 'pos' attribute. 216 # :param chars: Whether tokens should have character offsets in the 217 # 'startchar' and 'endchar' attributes. 218 # :param removestops: whether to remove stop words from the stream (if 219 # the tokens pass through a stop filter). 
220 # :param mode: contains a string describing the purpose for which the 221 # analyzer is being called, i.e. 'index' or 'query'. 222 # """ 223 # 224 # self.positions = positions 225 # self.chars = chars 226 # self.stopped = False 227 # self.boost = 1.0 228 # self.removestops = removestops 229 # self.mode = mode 230 # self.__dict__.update(kwargs) 231 # 232 # def __repr__(self): 233 # parms = ", ".join("%s=%r" % (name, value) 234 # for name, value in self.__dict__.iteritems()) 235 # return "%s(%s)" % (self.__class__.__name__, parms) 236 # 237 # def copy(self): 238 # # This is faster than using the copy module 239 # return Token(**self.__dict__.copy()) 240 241 242 243 # mchaput modif: 244 # class Token(object): 245 # """ 246 # Represents a "token" (usually a word) extracted from the source text being 247 # indexed. 248 # 249 # See "Advanced analysis" in the user guide for more information. 250 # 251 # Because object instantiation in Python is slow, tokenizers should create 252 # ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes 253 # each time. 254 # 255 # This trick means that consumers of tokens (i.e. filters) must never try to 256 # hold onto the token object between loop iterations, or convert the token 257 # generator into a list. Instead, save the attributes between iterations, 258 # not the object:: 259 # 260 # def RemoveDuplicatesFilter(self, stream): 261 # # Removes duplicate words. 262 # lasttext = None 263 # for token in stream: 264 # # Only yield the token if its text doesn't 265 # # match the previous token. 266 # if lasttext != token.text: 267 # yield token 268 # lasttext = token.text 269 # 270 # ...or, call token.copy() to get a copy of the token object. 271 # """ 272 # 273 # def __init__(self, positions=False, chars=False, removestops=True, mode='', 274 # **kwargs): 275 # """ 276 # :param positions: Whether tokens should have the token position in the 277 # 'pos' attribute. 
278 # :param chars: Whether tokens should have character offsets in the 279 # 'startchar' and 'endchar' attributes. 280 # :param removestops: whether to remove stop words from the stream (if 281 # the tokens pass through a stop filter). 282 # :param mode: contains a string describing the purpose for which the 283 # analyzer is being called, i.e. 'index' or 'query'. 284 # """ 285 # 286 # self.positions = positions 287 # self.chars = chars 288 # self.stopped = False 289 # self.boost = 1.0 290 # self.removestops = removestops 291 # self.mode = mode 292 # self.__dict__.update(kwargs) 293 # 294 # def __repr__(self): 295 # parms = ", ".join("%s=%r" % (name, value) 296 # for name, value in self.__dict__.iteritems()) 297 # return "%s(%s)" % (self.__class__.__name__, parms) 298 # 299 # def copy(self): 300 # # This is faster than using the copy module 301 # return Token(**self.__dict__) 302 303 304 142 305 143 306 … … 184 347 t = Token(positions, chars, removestops=removestops, mode=mode) 185 348 t.text = value 349 t.boost=1.0 186 350 if keeporiginal: 187 351 t.original = value … … 203 367 """ 204 368 205 __inittypes__ = dict(expression=unicode, gaps=bool )206 207 def __init__(self, expression=r"\w+(\.?\w+)*", gaps=False ):369 __inittypes__ = dict(expression=unicode, gaps=bool, lowercase=bool) 370 371 def __init__(self, expression=r"\w+(\.?\w+)*", gaps=False, lowercase=False): 208 372 """ 209 373 :param expression: A regular expression object or string. Each match … … 214 378 than matching on the expression. 
215 379 """ 216 380 217 381 if isinstance(expression, basestring): 218 382 self.expression = re.compile(expression, re.UNICODE) … … 220 384 self.expression = expression 221 385 self.gaps = gaps 386 387 self.lowercase = lowercase 388 222 389 223 390 def __eq__(self, other): … … 245 412 246 413 assert isinstance(value, unicode), "%r is not unicode" % value 247 414 415 if self.lowercase: 416 lowervalue = value.lower() 417 else: 418 lowervalue = value 419 248 420 t = Token(positions, chars, removestops=removestops, mode=mode) 249 421 if not tokenize: 250 t.original = t.text = value 422 t.original = value 423 t.text = lowervalue 424 t.boost = 1.0 251 425 if positions: t.pos = start_pos 252 426 if chars: … … 256 430 elif not self.gaps: 257 431 # The default: expression matches are used as tokens 258 for pos, match in enumerate(self.expression.finditer(value)): 259 t.text = match.group(0) 432 for pos, match in enumerate(self.expression.finditer(lowervalue)): 433 ms = match.start() 434 me = match.end() 435 t.text = lowervalue[ms:me] # match.group(0) 260 436 if keeporiginal: 261 t.original = t.text437 t.original = value[ms:me] 262 438 t.stopped = False 263 439 if positions: 264 440 t.pos = start_pos + pos 265 441 if chars: 266 t.startchar = start_char + m atch.start()267 t.endchar = start_char + m atch.end()442 t.startchar = start_char + ms 443 t.endchar = start_char + me 268 444 yield t 269 445 else: … … 272 448 prevend = 0 273 449 pos = start_pos 274 for match in self.expression.finditer( value):450 for match in self.expression.finditer(lowervalue): 275 451 start = prevend 276 452 end = match.start() 277 text = value[start:end]453 text = lowervalue[start:end] 278 454 if text: 279 455 t.text = text 456 t.boost = 1.0 280 457 if keeporiginal: 281 t.original = t.text458 t.original = value[start:end] 282 459 t.stopped = False 283 460 if positions: … … 296 473 if prevend < len(value): 297 474 t.text = value[prevend:] 475 t.boost = 1.0 298 476 if keeporiginal: 299 477 t.original 
= t.text … … 365 543 if not tokenize: 366 544 t.original = t.text = value 545 t.boost = 1.0 367 546 if positions: t.pos = start_pos 368 547 if chars: … … 382 561 if currentchar > startchar: 383 562 t.text = text 563 t.boost = 1.0 384 564 if keeporiginal: 385 565 t.original = t.text … … 398 578 if currentchar > startchar: 399 579 t.text = value[startchar:currentchar] 580 t.boost = 1.0 400 581 if keeporiginal: 401 582 t.original = t.text … … 1169 1350 "the-sign", "sign-of", "of-four" 1170 1351 1171 This can be used in fields dedicated to phrase searching. In the example 1172 above, the three "bi-word" tokens will be faster to find than the four 1173 original words since there are fewer of them and they will be much less 1174 frequent (especially compared to words like "the" and "of"). 1352 This can be used to create fields for pseudo-phrase searching, where if 1353 all the terms match the document probably contains the phrase, but the 1354 searching is faster than actually doing a phrase search on individual word 1355 terms. 1356 1357 The ``BiWordFilter`` is much faster than using the otherwise equivalent 1358 ``ShingleFilter(2)``. 1175 1359 """ 1176 1360 … … 1213 1397 if positions: prev_pos = ps 1214 1398 1215 # If atno bi-words were emitted, that is, the token stream only had1399 # If no bi-words were emitted, that is, the token stream only had 1216 1400 # a single token, then emit that single token. 
1217 1401 if not atleastone: 1218 1402 yield token 1219 1403 1404 1405 class ShingleFilter(Filter): 1406 """Merges a certain number of adjacent tokens into multi-word tokens, so 1407 that for example:: 1408 1409 "better", "a", "witty", "fool", "than", "a", "foolish", "wit" 1410 1411 with ``ShingleFilter(3, ' ')`` becomes:: 1412 1413 'better a witty', 'a witty fool', 'witty fool than', 'fool than a', 1414 'than a foolish', 'a foolish wit' 1415 1416 This can be used to create fields for pseudo-phrase searching, where if 1417 all the terms match the document probably contains the phrase, but the 1418 searching is faster than actually doing a phrase search on individual word 1419 terms. 1420 1421 If you're using two-word shingles, you should use the functionally 1422 equivalent ``BiWordFilter`` instead because it's faster than 1423 ``ShingleFilter``. 1424 """ 1425 1426 def __init__(self, size=2, sep="-"): 1427 self.size = size 1428 self.sep = sep 1429 1430 def __call__(self, tokens): 1431 size = self.size 1432 sep = self.sep 1433 buf = deque() 1434 atleastone = False 1435 1436 def make_token(): 1437 tk = buf[0] 1438 tk.text = sep.join([t.text for t in buf]) 1439 if tk.chars: 1440 tk.endchar = buf[-1].endchar 1441 return tk 1442 1443 for token in tokens: 1444 buf.append(token.copy()) 1445 if len(buf) == size: 1446 atleastone = True 1447 yield make_token() 1448 buf.popleft() 1449 1450 # If no shingles were emitted, that is, the token stream had fewer than 1451 # 'size' tokens, then emit a single token with whatever tokens there 1452 # were 1453 if not atleastone: 1454 yield make_token() 1455 1220 1456 1221 1457 class BoostTextFilter(Filter): … … 1260 1496 1261 1497 1262 class Delimi natedAttributeFilter(Filter):1498 class DelimitedAttributeFilter(Filter): 1263 1499 """Looks for delimiter characters in the text of each token and stores the 1264 1500 data after the delimiter in a named attribute on the token. 
… … 1323 1559 1324 1560 class DoubleMetaphoneFilter(Filter): 1325 def __init__(self, primary_boost=3.0): 1561 """Transforms the text of the tokens using Lawrence Philips's Double 1562 Metaphone algorithm. This algorithm attempts to encode words in such a way 1563 that similar-sounding words reduce to the same code. This may be useful for 1564 fields containing the names of people and places, and other uses where 1565 tolerance of spelling differences is desireable. 1566 """ 1567 1568 def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False): 1569 """ 1570 :param primary_boost: the boost to apply to the token containing the 1571 primary code. 1572 :param secondary_boost: the boost to apply to the token containing the 1573 secondary code, if any. 1574 :param combine: if True, the original unencoded tokens are kept in the 1575 stream, preceding the encoded tokens. 1576 """ 1577 1326 1578 self.primary_boost = primary_boost 1579 self.secondary_boost = secondary_boost 1580 self.combine = combine 1327 1581 1328 1582 def __eq__(self, other): … … 1333 1587 def __call__(self, tokens): 1334 1588 primary_boost = self.primary_boost 1589 secondary_boost = self.secondary_boost 1590 combine = self.combine 1335 1591 1336 1592 for t in tokens: 1593 if combine: 1594 yield t 1595 1337 1596 primary, secondary = double_metaphone(t.text) 1597 b = t.boost 1598 # Overwrite the token's text and boost and yield it 1338 1599 if primary: 1339 # Save the original boost1340 b = t.boost1341 # Overwrite the token's text and boost and yield it1342 1600 t.text = primary 1343 1601 t.boost = b * primary_boost 1344 1602 yield t 1345 # Restored the original boost1346 t.boost = b1347 1603 if secondary: 1348 1604 t.text = secondary 1605 t.boost = b * secondary_boost 1349 1606 yield t 1607 1608 1609 class SubstitutionFilter(Filter): 1610 """Performas a regular expression substitution on the token text. 
1611 1612 This is especially useful for removing text from tokens, for example 1613 hyphens:: 1614 1615 ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "") 1616 1617 Because it has the full power of the re.sub() method behind it, this filter 1618 can perform some fairly complex transformations. For example, to take tokens 1619 like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c', 'f=e'``:: 1620 1621 # Analyzer that swaps the text on either side of an equal sign 1622 ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("([^/]*)/(./*)", r"\\2/\\1") 1623 """ 1624 1625 def __init__(self, pattern, replacement): 1626 """ 1627 :param pattern: a pattern string or compiled regular expression object 1628 describing the text to replace. 1629 :param replacement: the substitution text. 1630 """ 1631 1632 if isinstance(pattern, basestring): 1633 pattern = re.compile(pattern, re.UNICODE) 1634 self.pattern = pattern 1635 self.replacement = replacement 1636 1637 def __eq__(self, other): 1638 return (other and self.__class__ is other.__class__ 1639 and self.pattern == other.pattern 1640 and self.replacement == other.replacement) 1641 1642 def __call__(self, tokens): 1643 pattern = self.pattern 1644 replacement = self.replacement 1645 1646 for t in tokens: 1647 t.text = pattern.sub(replacement, t.text) 1648 yield t 1649 1350 1650 1351 1651 # Analyzers … … 1483 1783 """ 1484 1784 1485 ret = RegexTokenizer(expression=expression, gaps=gaps) 1486 chain = ret | LowercaseFilter() 1785 # ret = RegexTokenizer(expression=expression, gaps=gaps) 1786 # chain = ret | LowercaseFilter() 1787 chain = RegexTokenizer(expression=expression, gaps=gaps, lowercase=True) 1487 1788 if stoplist is not None: 1488 1789 chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, -
branches/mbutscher/work/lib/whoosh/fields.py
r230 r234 65 65 being updated. 66 66 67 * multitoken_query is a string indicating what kind of query to use when 68 a "word" in a user query parses into multiple tokens. The string is 69 interpreted by the query parser. The strings understood by the default 70 query parser are "first" (use first token only), "and" (join the tokens 71 with an AND query), "or" (join the tokens with OR), and "phrase" (join 72 the tokens with a phrase query). 73 67 74 The constructor for the base field type simply lets you supply your own 68 75 configured field format, vector format, and scorable and stored values. 69 76 Subclasses may configure some or all of this for you. 70 71 77 """ 72 78 73 79 format = vector = scorable = stored = unique = None 74 80 indexed = True 81 multitoken_query = "first" 75 82 __inittypes__ = dict(format=Format, vector=Format, 76 83 scorable=bool, stored=bool, unique=bool) 77 84 78 85 def __init__(self, format, vector=None, scorable=False, stored=False, 79 unique=False ):86 unique=False, multitoken_query="first"): 80 87 self.format = format 81 88 self.vector = vector … … 83 90 self.stored = stored 84 91 self.unique = unique 92 self.multitoken_query = multitoken_query 85 93 86 94 def __repr__(self): -
branches/mbutscher/work/lib/whoosh/filedb/fileindex.py
r231 r234 144 144 145 145 # Generation 146 stream.read_int() 146 index_gen = stream.read_int() 147 assert gen == index_gen 147 148 148 149 segment_counter = stream.read_int() … … 156 157 157 158 def _next_segment_name(self): 158 #Returns the name of the next segment in sequence.159 if self.segment_num_lock is None:160 self.segment_num_lock = Lock()161 162 if self.segment_num_lock.acquire():163 try:164 self.segment_counter += 1165 return166 finally:167 self.segment_num_lock.release()168 else:169 raise LockError159 #Returns the name of the next segment in sequence. 160 if self.segment_num_lock is None: 161 self.segment_num_lock = Lock() 162 163 if self.segment_num_lock.acquire(): 164 try: 165 self.segment_counter += 1 166 return 167 finally: 168 self.segment_num_lock.release() 169 else: 170 raise LockError 170 171 171 172 172 173 def _clean_files(storage, indexname, gen, segments): 173 # Attempts to remove unused index files (called when a new generation174 # is created). If existing Index and/or reader objects have the files175 # open, they may not be deleted immediately (i.e. on Windows) but will176 # probably be deleted eventually by a later call to clean_files.177 178 current_segment_names = set(s.name for s in segments)179 180 tocpattern = _toc_pattern(indexname)181 segpattern = _segment_pattern(indexname)182 183 todelete = set()184 for filename in storage:185 tocm = tocpattern.match(filename)186 segm = segpattern.match(filename)187 if tocm:188 if int(tocm.group(1)) != gen:189 todelete.add(filename)190 elif segm:191 name = segm.group(1)192 if name not in current_segment_names:193 todelete.add(filename)194 195 for filename in todelete:196 try:197 storage.delete_file(filename)198 except OSError:199 # Another process still has this file open200 pass174 # Attempts to remove unused index files (called when a new generation 175 # is created). If existing Index and/or reader objects have the files 176 # open, they may not be deleted immediately (i.e. 
on Windows) but will 177 # probably be deleted eventually by a later call to clean_files. 178 179 current_segment_names = set(s.name for s in segments) 180 181 tocpattern = _toc_pattern(indexname) 182 segpattern = _segment_pattern(indexname) 183 184 todelete = set() 185 for filename in storage: 186 tocm = tocpattern.match(filename) 187 segm = segpattern.match(filename) 188 if tocm: 189 if int(tocm.group(1)) != gen: 190 todelete.add(filename) 191 elif segm: 192 name = segm.group(1) 193 if name not in current_segment_names: 194 todelete.add(filename) 195 196 for filename in todelete: 197 try: 198 storage.delete_file(filename) 199 except OSError: 200 # Another process still has this file open 201 pass 201 202 202 203 -
branches/mbutscher/work/lib/whoosh/filedb/filepostings.py
r231 r234 334 334 self.stringids = stringids 335 335 336 assert postfile.get_int(offset) == -48626 336 magic = postfile.get_int(offset) 337 assert magic == -48626 338 337 339 self.blockcount = postfile.get_uint(offset + _INT_SIZE) 338 340 self.baseoffset = offset + _INT_SIZE * 2 -
branches/mbutscher/work/lib/whoosh/formats.py
r231 r234 187 187 for t in unstopped(self.analyzer(value, boosts=True, **kwargs)): 188 188 freqs[t.text] += 1 189 weights[t.text] += int(t.boost)189 weights[t.text] += t.boost 190 190 191 191 encode = self.encode -
branches/mbutscher/work/lib/whoosh/highlight.py
r232 r234 22 22 from __future__ import division 23 23 from heapq import nlargest 24 from cgi import escape as htmlescape 24 from collections import deque 25 26 # from cgi import escape as htmlescape 27 from pwiki.StringOps import escapeHtml as htmlescape 25 28 26 29 # Fragment object … … 187 190 """ 188 191 189 def __init__(self, termset, maxchars=200, surround=20):192 def __init__(self, termset, maxchars=200, charsbefore=20, charsafter=20): 190 193 """ 191 194 :param termset: A collection (probably a set or frozenset) containing … … 193 196 :param maxchars: The maximum number of characters allowed in a 194 197 fragment. 195 :param surround: The number of extra characters of context to add both 196 before the first matched term and after the last matched term. 197 """ 198 198 :param charsbefore: The number of extra characters of context to add 199 before the first matched term . 200 :param charsafter: The number of extra characters of context to add 201 after the last matched term. 202 """ 199 203 self.maxchars = maxchars 200 self.charsbefore = self.charsafter = surround 204 self.charsbefore = charsbefore 205 self.charsafter = charsafter 206 201 207 202 208 def __call__(self, text, tokens): … … 205 211 charsafter = self.charsafter 206 212 207 current = []213 current = deque() 208 214 currentlen = 0 209 215 countdown = -1 … … 222 228 if countdown < 0 or currentlen >= maxchars: 223 229 yield Fragment(current) 224 current = []230 current.clear() 225 231 currentlen = 0 226 232 227 233 else: 228 while current and currentlen > charsbefore: 229 t = current.pop(0) 234 # while current and currentlen > charsbefore: len(current) can't be 0 if currentlen > 0 and 235 # charsbefore should be >= 0 236 while currentlen > charsbefore: 237 t = current.popleft() 230 238 currentlen -= t.endchar - t.startchar 231 239 … … 340 348 341 349 350 class SimpleHtmlFormatter(object): 351 """Returns a string in which the matched terms are enclosed in <b></b>. 
352 """ 353 354 def __init__(self, between=u"... "): 355 """ 356 :param between: the text to add between fragments. 357 """ 358 self.between = between 359 self.firstPos = -1 360 361 def _format_fragment(self, text, fragment): 362 output = [] 363 index = fragment.startchar 364 365 for t in fragment.matches: 366 if t.startchar > index: 367 output.append(htmlescape(text[index:t.startchar])) 368 369 ttxt = htmlescape(text[t.startchar:t.endchar]) 370 if t.matched: 371 ttxt = "<b>%s</b>" % ttxt 372 if self.firstPos == -1: 373 self.firstPos = t.startchar 374 else: 375 self.firstPos = min(self.firstPos, t.startchar) 376 377 output.append(ttxt) 378 index = t.endchar 379 380 output.append(htmlescape(text[index:fragment.endchar])) 381 return u"".join(output) 382 383 def __call__(self, text, fragments): 384 return self.between.join([self._format_fragment(text, fragment) 385 for fragment in fragments]), self.firstPos 386 387 388 389 342 390 class HtmlFormatter(object): 343 391 """Returns a string containing HTML formatting around the matched terms. 
… … 428 476 429 477 430 class GenshiFormatter(object):431 """Returns a Genshi event stream containing HTML formatting around the432 matched terms.433 """434 435 def __init__(self, qname="strong", between="..."):436 """437 :param qname: the QName for the tag to wrap around matched terms.438 :param between: the text to add between fragments.439 """440 441 self.qname = qname442 self.between = between443 444 from genshi.core import START, END, TEXT, Attrs, Stream #@UnresolvedImport445 self.START, self.END, self.TEXT = START, END, TEXT446 self.Attrs, self.Stream = Attrs, Stream447 448 def _add_text(self, text, output):449 if output and output[-1][0] == self.TEXT:450 output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2])451 else:452 output.append((self.TEXT, text, (None, -1, -1)))453 454 def _format_fragment(self, text, fragment):455 START, TEXT, END, Attrs = self.START, self.TEXT, self.END, self.Attrs456 qname = self.qname457 output = []458 459 index = fragment.startchar460 lastmatched = False461 for t in fragment.matches:462 if t.startchar > index:463 if lastmatched:464 output.append((END, qname, (None, -1, -1)))465 lastmatched = False466 self._add_text(text[index:t.startchar], output)467 468 ttxt = text[t.startchar:t.endchar]469 if not lastmatched:470 output.append((START, (qname, Attrs()), (None, -1, -1)))471 lastmatched = True472 output.append((TEXT, ttxt, (None, -1, -1)))473 474 index = t.endchar475 476 if lastmatched:477 output.append((END, qname, (None, -1, -1)))478 479 return output480 481 def __call__(self, text, fragments):482 output = []483 first = True484 for fragment in fragments:485 if not first:486 self._add_text(self.between, output)487 first = False488 output += self._format_fragment(text, fragment)489 490 return self.Stream(output)478 # class GenshiFormatter(object): 479 # """Returns a Genshi event stream containing HTML formatting around the 480 # matched terms. 
481 # """ 482 # 483 # def __init__(self, qname="strong", between="..."): 484 # """ 485 # :param qname: the QName for the tag to wrap around matched terms. 486 # :param between: the text to add between fragments. 487 # """ 488 # 489 # self.qname = qname 490 # self.between = between 491 # 492 # from genshi.core import START, END, TEXT, Attrs, Stream #@UnresolvedImport 493 # self.START, self.END, self.TEXT = START, END, TEXT 494 # self.Attrs, self.Stream = Attrs, Stream 495 # 496 # def _add_text(self, text, output): 497 # if output and output[-1][0] == self.TEXT: 498 # output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2]) 499 # else: 500 # output.append((self.TEXT, text, (None, -1, -1))) 501 # 502 # def _format_fragment(self, text, fragment): 503 # START, TEXT, END, Attrs = self.START, self.TEXT, self.END, self.Attrs 504 # qname = self.qname 505 # output = [] 506 # 507 # index = fragment.startchar 508 # lastmatched = False 509 # for t in fragment.matches: 510 # if t.startchar > index: 511 # if lastmatched: 512 # output.append((END, qname, (None, -1, -1))) 513 # lastmatched = False 514 # self._add_text(text[index:t.startchar], output) 515 # 516 # ttxt = text[t.startchar:t.endchar] 517 # if not lastmatched: 518 # output.append((START, (qname, Attrs()), (None, -1, -1))) 519 # lastmatched = True 520 # output.append((TEXT, ttxt, (None, -1, -1))) 521 # 522 # index = t.endchar 523 # 524 # if lastmatched: 525 # output.append((END, qname, (None, -1, -1))) 526 # 527 # return output 528 # 529 # def __call__(self, text, fragments): 530 # output = [] 531 # first = True 532 # for fragment in fragments: 533 # if not first: 534 # self._add_text(self.between, output) 535 # first = False 536 # output += self._format_fragment(text, fragment) 537 # 538 # return self.Stream(output) 491 539 492 540 -
branches/mbutscher/work/lib/whoosh/lang/porter2.py
r230 r234 1 # Copyright (c) 2008 Michael Dirolf (mike at dirolf dot com)2 3 # Permission is hereby granted, free of charge, to any person4 # obtaining a copy of this software and associated documentation5 # files (the "Software"), to deal in the Software without6 # restriction, including without limitation the rights to use,7 # copy, modify, merge, publish, distribute, sublicense, and/or sell8 # copies of the Software, and to permit persons to whom the9 # Software is furnished to do so, subject to the following10 # conditions:11 12 # The above copyright notice and this permission notice shall be13 # included in all copies or substantial portions of the Software.14 15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,16 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES17 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND18 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT19 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,20 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING21 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR22 # OTHER DEALINGS IN THE SOFTWARE.23 24 1 """An implementation of the Porter2 stemming algorithm. 25 2 See http://snowball.tartarus.org/algorithms/english/stemmer.html … … 29 6 This algorithm is more correct but (at least in this implementation) 30 7 several times slower than the original porter algorithm as implemented 31 in whoosh.lang.porter.8 in stemming.porter. 32 9 """ 33 10 … … 146 123 147 124 def step_1c(word): 148 if word.endswith('y') or word.endswith('Y') :125 if word.endswith('y') or word.endswith('Y') and len(word) > 1: 149 126 if word[-2] not in 'aeiouy': 150 127 if len(word) > 2: … … 305 282 return word 306 283 307 284 285 -
branches/mbutscher/work/lib/whoosh/matching.py
r231 r234 252 252 253 253 def __init__(self, ids, weights=None, values=None, format=None, 254 scorer=None, position=0 ):254 scorer=None, position=0, all_weights=None): 255 255 """ 256 256 :param ids: a list of doc IDs. … … 267 267 self._ids = ids 268 268 self._weights = weights 269 self._all_weights = all_weights 269 270 self._values = values 270 271 self._i = position … … 319 320 320 321 def weight(self): 321 if self._weights: 322 if self._all_weights: 323 return self._all_weights 324 elif self._weights: 322 325 return self._weights[self._i] 323 326 else: … … 1023 1026 child.skip_to(self._id) 1024 1027 1025 while (self._id < self.limit 1026 and ((child.is_active() and self._id == child.id()) 1027 or missing(self._id))): 1028 self._id += 1 1029 if child.is_active(): 1028 # While self._id is missing or is in the child matcher, increase it 1029 while child.is_active() and self._id < self.limit: 1030 if missing(self._id): 1031 self._id += 1 1032 continue 1033 1034 if self._id == child.id(): 1035 self._id += 1 1030 1036 child.next() 1031 1037 continue 1038 1039 break 1040 1032 1041 def id(self): 1033 1042 return self._id -
branches/mbutscher/work/lib/whoosh/qparser/default.py
r231 r234 20 20 """ 21 21 22 import re23 24 22 from whoosh import query 25 26 27 class QueryParserError(Exception): 28 def __init__(self, cause, msg=None): 29 super(QueryParserError, self).__init__(str(cause)) 30 self.cause = cause 31 32 33 def rcompile(pattern, flags=0): 34 if not isinstance(pattern, basestring): 35 # If it's not a string, assume it's already a compiled pattern 36 return pattern 37 return re.compile(pattern, re.UNICODE | flags) 38 39 40 def get_single_text(field, text, **kwargs): 41 # Just take the first token 42 for t in field.process_text(text, mode="query", **kwargs): 43 return t 23 from whoosh.qparser.syntax import * 24 from whoosh.qparser.plugins import * 44 25 45 26 … … 48 29 49 30 50 class SyntaxObject(object): 51 """An object representing parsed text. These objects generally correspond 52 to a query object type, and are intermediate objects used to represent 53 the syntax tree parsed from a query string, and then generate a query 54 tree from the syntax tree. There will be syntax objects that do not have 55 a corresponding query type, such as the object representing whitespace. 56 """ 57 58 def query(self, parser): 59 """Returns a query object tree representing this parser object. 60 """ 61 62 raise NotImplementedError 63 64 65 # Grouping objects 66 67 class Group(SyntaxObject): 68 """An object representing a group of objects. These generally correspond 69 to compound query objects such as ``query.And`` and ``query.Or``. 
70 """ 71 72 def __init__(self, tokens=None, boost=1.0): 73 if tokens: 74 self.tokens = tokens 75 else: 76 self.tokens = [] 77 self.boost = boost 78 79 def __repr__(self): 80 r = "%s(%r)" % (self.__class__.__name__, self.tokens) 81 if self.boost != 1.0: 82 r += "^%s" % self.boost 83 return r 84 85 def __nonzero__(self): 86 return bool(self.tokens) 87 88 def __iter__(self): 89 return iter(self.tokens) 90 91 def __len__(self): 92 return len(self.tokens) 93 94 def __getitem__(self, n): 95 return self.tokens.__getitem__(n) 96 97 def __setitem__(self, n, v): 98 self.tokens.__setitem__(n, v) 99 100 def set_boost(self, b): 101 return self.__class__(self.tokens[:], boost=b) 102 103 def set_fieldname(self, name): 104 return self.__class__([t.set_fieldname(name) for t in self.tokens]) 105 106 def append(self, item): 107 self.tokens.append(item) 108 109 def extend(self, items): 110 self.tokens.extend(items) 111 112 def pop(self): 113 return self.tokens.pop() 114 115 def query(self, parser): 116 return self.qclass([t.query(parser) for t in self.tokens], 117 boost=self.boost) 118 119 def empty(self): 120 return self.__class__(boost=self.boost) 121 122 123 class AndGroup(Group): 124 """Syntax group corresponding to an And query. 125 """ 126 127 qclass = query.And 128 129 130 class OrGroup(Group): 131 """Syntax group corresponding to an Or query. 132 """ 133 134 qclass = query.Or 135 136 137 class AndNotGroup(Group): 138 """Syntax group corresponding to an AndNot query. 139 """ 140 141 def query(self, parser): 142 assert len(self.tokens) == 2 143 return query.AndNot(self.tokens[0].query(parser), 144 self.tokens[1].query(parser), boost=self.boost) 145 146 class AndMaybeGroup(Group): 147 """Syntax group corresponding to an AndMaybe query. 
148 """ 149 150 def query(self, parser): 151 assert len(self.tokens) == 2 152 return query.AndMaybe(self.tokens[0].query(parser), 153 self.tokens[1].query(parser), boost=self.boost) 154 155 156 class DisMaxGroup(Group): 157 """Syntax group corresponding to a DisjunctionMax query. 158 """ 159 160 def __init__(self, tokens=None, tiebreak=0.0, boost=None): 161 super(DisMaxGroup, self).__init__(tokens) 162 self.tiebreak = tiebreak 163 164 def __repr__(self): 165 r = "dismax(%r" % self.tokens 166 if self.tiebreak != 0: 167 r += " tb=%s" % self.tiebreak 168 r += ")" 169 return r 170 171 def query(self, parser): 172 return query.DisjunctionMax([t.query(parser) for t in self.tokens], 173 tiebreak=self.tiebreak) 174 175 def empty(self): 176 return self.__class__(tiebreak=self.tiebreak) 177 178 179 class NotGroup(Group): 180 """Syntax group corresponding to a Not query. 181 """ 182 183 def __repr__(self): 184 return "NOT(%r)" % self.tokens 185 186 def query(self, parser): 187 assert len(self.tokens) == 1 188 return query.Not(self.tokens[0].query(parser)) 189 190 191 # Parse-able tokens 192 193 class Token(SyntaxObject): 194 """A parse-able token object. Each token class has an ``expr`` attribute 195 containing a regular expression that matches the token text. When this 196 expression is found, the class's ``create()`` class method is called and 197 returns a token object to represent the match in the syntax tree. When the 198 syntax tree is finished, the 199 """ 200 201 fieldname = None 202 endpos = None 203 204 def set_boost(self, b): 205 return self 206 207 def set_fieldname(self, name): 208 return self 209 210 @classmethod 211 def match(cls, text, pos): 212 return cls.expr.match(text, pos) 213 214 @classmethod 215 def create(cls, parser, match): 216 return cls() 217 218 def query(self, parser): 219 raise NotImplementedError 220 221 222 class Singleton(Token): 223 """Base class for tokens that don't carry any information specific to 224 each instance (e.g. 
"open paranthesis" token), so they can all share the 225 same instance. 226 """ 227 228 me = None 229 230 def __repr__(self): 231 return self.__class__.__name__ 232 233 @classmethod 234 def create(cls, parser, match): 235 if not cls.me: 236 cls.me = cls() 237 return cls.me 238 239 240 class White(Singleton): 241 expr = rcompile("\\s+") 242 243 244 class ErrorToken(Token): 245 """A token representing an unavoidable parsing error. The ``query()`` 246 method always returns NullQuery. 247 248 The default parser usually does not produce "errors" (text that doesn't 249 match the syntax is simply treated as part of the query), so this is mostly 250 for use by plugins that may add more restrictive parsing, for example 251 :class:`DateParserPlugin`. 252 253 Since the corresponding NullQuery will be filtered out when the query is 254 normalized, this is really only useful for debugging and possibly for 255 plugin filters. 256 257 The ``token`` attribute may contain the token that produced the error. 258 """ 259 260 def __init__(self, token): 261 self.token = token 262 263 def __repr__(self): 264 return "<%s (%r)>" % (self.__class__.__name__, self.token) 265 266 def query(self, parser): 267 return query.NullQuery 268 269 270 class BasicSyntax(Token): 271 """Base class for "basic" (atomic) syntax -- term, prefix, wildcard, 272 phrase, range. 
273 """ 274 275 expr = None 276 qclass = None 277 tokenize = False 278 removestops = False 279 280 def __init__(self, text, fieldname=None, boost=1.0): 281 self.fieldname = fieldname 282 self.text = text 283 self.boost = boost 284 285 def set_boost(self, b): 286 return self.__class__(self.text, fieldname=self.fieldname, boost=b) 287 288 def set_fieldname(self, name): 289 if self.fieldname is None: 290 return self.__class__(self.text, fieldname=name, boost=self.boost) 291 else: 292 return self 293 294 def __repr__(self): 295 r = "%s:%r" % (self.fieldname, self.text) 296 if self.boost != 1.0: 297 r += "^%s" % self.boost 298 return r 299 300 @classmethod 301 def create(cls, parser, match): 302 return cls(match.group(0)) 303 304 def query(self, parser): 305 texts = (self.text, ) 306 fieldname = self.fieldname or parser.fieldname 307 cls = self.qclass or parser.termclass 308 309 if parser.schema and fieldname in parser.schema: 310 field = parser.schema[fieldname] 311 312 if field.self_parsing(): 313 try: 314 return field.parse_query(fieldname, self.text, 315 boost=self.boost) 316 except QueryParserError: 317 return query.NullQuery 318 319 texts = list(field.process_text(self.text, mode="query", 320 tokenize=self.tokenize, 321 removestops=self.removestops)) 322 323 if len(texts) > 1: 324 compound = parser.group.qclass 325 return compound([cls(fieldname, t, boost=self.boost) 326 for t in texts]) 327 elif texts and texts[0] is not None: 328 return cls(fieldname, texts[0], boost=self.boost) 329 else: 330 return query.NullQuery 331 332 333 class Word(BasicSyntax): 334 """Syntax object representing a term. 335 """ 336 337 expr = rcompile("[^ \t\r\n)]+") 338 tokenize = True 339 removestops = True 340 341 342 # Parser plugins 343 344 class Plugin(object): 345 """Base class for parser plugins. 346 """ 347 348 def tokens(self, parser): 349 """Returns a list of ``(token_class, priority)`` tuples to add to the 350 syntax the parser understands. 
351 """ 352 353 return () 354 355 def filters(self, parser): 356 """Returns a list of ``(filter_function, priority)`` tuples to add to 357 parser. 358 """ 359 360 return () 361 362 363 class RangePlugin(Plugin): 364 """Adds the ability to specify term ranges. 365 366 This plugin has no configuration. 367 368 This plugin is included in the default parser configuration. 369 """ 370 371 def tokens(self, parser): 372 return ((RangePlugin.Range, 1), ) 373 374 class Range(Token): 375 expr = rcompile(r""" 376 (?P<open>\{|\[) # Open paren 377 378 ( # Begin optional "start" 379 ( # Begin choice between start1 and start2 380 ('(?P<start2>[^']+)') # Quoted start 381 | (?P<start1>[^ ]+) # ...or regular start 382 ) # End choice 383 [ ]+)? # Space at end of optional "start" 384 385 [Tt][Oo] # "to" between start and end 386 387 ([ ]+ # Space at start of optional "end" 388 ( # Begin choice between end1 and end2 389 ('(?P<end2>[^']+)') # Quoted end 390 | (?P<end1>[^\]\}]*) # ...or normal end 391 ) # End choice 392 )? 
# End of optional "end 393 394 (?P<close>\}|\]) # Close paren 395 """, re.VERBOSE) 396 397 def __init__(self, start, end, startexcl, endexcl, fieldname=None, boost=1.0): 398 self.fieldname = fieldname 399 self.start = start 400 self.end = end 401 self.startexcl = startexcl 402 self.endexcl = endexcl 403 self.boost = boost 404 405 def set_boost(self, b): 406 return self.__class__(self.start, self.end, self.startexcl, 407 self.endexcl, fieldname=self.fieldname, 408 boost=b) 409 410 def set_fieldname(self, name): 411 return self.__class__(self.start, self.end, self.startexcl, 412 self.endexcl, fieldname=name, 413 boost=self.boost) 414 415 def __repr__(self): 416 r = "%s:(%r, %r, %s, %s)" % (self.fieldname, self.start, self.end, 417 self.startexcl, self.endexcl) 418 if self.boost != 1.0: 419 r += "^%s" % self.boost 420 return r 421 422 @classmethod 423 def create(cls, parser, match): 424 start = match.group("start2") or match.group("start1") 425 end = match.group("end2") or match.group("end1") 426 return cls(start, end, startexcl=match.group("open") == "{", 427 endexcl=match.group("close") == "}") 428 429 def query(self, parser): 430 fieldname = self.fieldname or parser.fieldname 431 start, end = self.start, self.end 432 if parser.schema and fieldname in parser.schema: 433 field = parser.schema[fieldname] 434 435 if field.self_parsing(): 436 try: 437 rangeq = field.parse_range(fieldname, start, end, 438 self.startexcl, self.endexcl, 439 boost=self.boost) 440 if rangeq is not None: 441 return rangeq 442 except QueryParserError, e: 443 return query.NullQuery 444 445 if start: 446 start = get_single_text(field, start, tokenize=False, 447 removestops=False) 448 if end: 449 end = get_single_text(field, end, tokenize=False, 450 removestops=False) 451 452 if start is None: 453 start = u'' 454 if end is None: 455 end = u'\uFFFF' 456 457 return query.TermRange(fieldname, start, end, self.startexcl, 458 self.endexcl, boost=self.boost) 459 460 461 class PhrasePlugin(Plugin): 462 
"""Adds the ability to specify phrase queries inside double quotes. 463 464 This plugin has no configuration. 465 466 This plugin is included in the default parser configuration. 467 """ 468 469 def tokens(self, parser): 470 return ((PhrasePlugin.Quotes, 0), ) 471 472 class Quotes(BasicSyntax): 473 expr = rcompile('"(.*?)"') 474 475 def __init__(self, text, fieldname=None, boost=1.0, slop=1): 476 super(PhrasePlugin.Quotes, self).__init__(text, fieldname=fieldname, 477 boost=boost) 478 self.slop = slop 479 480 def __repr__(self): 481 r = "%s:q(%r)" % (self.fieldname, self.text) 482 if self.boost != 1.0: 483 r += "^%s" % self.boost 484 return r 485 486 @classmethod 487 def create(cls, parser, match): 488 slop = 1 489 #if match.group(5): 490 # try: 491 # slop = int(match.group(5)) 492 # except ValueError: 493 # pass 494 return cls(match.group(1), slop=slop) 495 496 def query(self, parser): 497 fieldname = self.fieldname or parser.fieldname 498 if parser.schema and fieldname in parser.schema: 499 field = parser.schema[fieldname] 500 #if field.self_parsing(): 501 # return field.parse_query(fieldname, self.text, boost=self.boost) 502 #else: 503 words = list(field.process_text(self.text, mode="query")) 504 else: 505 words = self.text.split(" ") 506 507 return parser.phraseclass(fieldname, words, boost=self.boost, 508 slop=self.slop) 509 510 511 class SingleQuotesPlugin(Plugin): 512 """Adds the ability to specify single "terms" containing spaces by 513 enclosing them in single quotes. 514 515 This plugin has no configuration. 516 517 This plugin is included in the default parser configuration. 
518 """ 519 520 def tokens(self, parser): 521 return ((SingleQuotesPlugin.SingleQuotes, 0), ) 522 523 class SingleQuotes(Token): 524 expr = rcompile(r"(^|(?<=\W))'(.*?)'(?=\s|\]|[)}]|$)") 525 526 @classmethod 527 def create(cls, parser, match): 528 return Word(match.group(2)) 529 530 531 class PrefixPlugin(Plugin): 532 """Adds the ability to specify prefix queries by ending a term with an 533 asterisk. This plugin is useful if you want the user to be able to create 534 prefix but not wildcard queries (for performance reasons). If you are 535 including the wildcard plugin, you should not include this plugin as well. 536 """ 537 538 def tokens(self, parser): 539 return ((PrefixPlugin.Prefix, 0), ) 540 541 class Prefix(BasicSyntax): 542 expr = rcompile("[^ \t\r\n*]+\\*(?= |$|\\))") 543 qclass = query.Prefix 544 545 def __repr__(self): 546 r = "%s:pre(%r)" % (self.fieldname, self.text) 547 if self.boost != 1.0: 548 r += "^%s" % self.boost 549 return r 550 551 @classmethod 552 def create(cls, parser, match): 553 return cls(match.group(0)[:-1]) 554 555 556 class WildcardPlugin(Plugin): 557 """Adds the ability to specify wildcard queries by using asterisk and 558 question mark characters in terms. Note that these types can be very 559 performance and memory intensive. You may consider not including this 560 type of query. 561 562 This plugin is included in the default parser configuration. 
563 """ 564 565 def tokens(self, parser): 566 return ((WildcardPlugin.Wild, 1), ) 567 568 class Wild(BasicSyntax): 569 # \u055E = Armenian question mark 570 # \u061F = Arabic question mark 571 # \u1367 = Ethiopic question mark 572 expr = rcompile(u"[^ \t\r\n*?\u055E\u061F\u1367]*[*?\u055E\u061F\u1367]\\S*") 573 qclass = query.Wildcard 574 575 def __repr__(self): 576 r = "%s:wild(%r)" % (self.fieldname, self.text) 577 if self.boost != 1.0: 578 r += "^%s" % self.boost 579 return r 580 581 @classmethod 582 def create(cls, parser, match): 583 return cls(match.group(0)) 584 585 586 class WhitespacePlugin(Plugin): 587 """Parses whitespace between words in the query string. You should always 588 include this plugin. 589 590 This plugin is always automatically included by the QueryParser. 591 """ 592 593 def __init__(self, tokenclass=White): 594 self.tokenclass = tokenclass 595 596 def tokens(self, parser): 597 return ((self.tokenclass, 100), ) 598 599 def filters(self, parser): 600 return ((self.do_whitespace, 500), ) 601 602 def do_whitespace(self, parser, stream): 603 newstream = stream.empty() 604 for t in stream: 605 if isinstance(t, Group): 606 newstream.append(self.do_whitespace(parser, t)) 607 elif not isinstance(t, self.tokenclass): 608 newstream.append(t) 609 return newstream 610 611 612 class GroupPlugin(Plugin): 613 """Adds the ability to group clauses using parentheses. 614 615 This plugin is included in the default parser configuration. 
616 """ 617 618 def tokens(self, parser): 619 return ((GroupPlugin.Open, 0), (GroupPlugin.Close, 0)) 620 621 def filters(self, parser): 622 return ((GroupPlugin.do_groups, 0), ) 623 624 @staticmethod 625 def do_groups(parser, stream): 626 stack = [parser.group()] 627 for t in stream: 628 if isinstance(t, GroupPlugin.Open): 629 stack.append(parser.group()) 630 elif isinstance(t, GroupPlugin.Close): 631 if len(stack) > 1: 632 last = stack.pop() 633 stack[-1].append(last) 634 else: 635 stack[-1].append(t) 636 637 top = stack[0] 638 if len(stack) > 1: 639 for ls in stack[1:]: 640 top.extend(ls) 641 642 if len(top) == 1 and isinstance(top[0], Group): 643 top = top[0].set_boost(top.boost) 644 645 return top 646 647 class Open(Singleton): 648 expr = rcompile("\\(") 649 650 class Close(Singleton): 651 expr = rcompile("\\)") 652 653 654 class FieldsPlugin(Plugin): 655 """Adds the ability to specify the field of a clause using a colon. 656 657 This plugin is included in the default parser configuration. 
658 """ 659 660 def tokens(self, parser): 661 return ((FieldsPlugin.Field, 0), ) 662 663 def filters(self, parser): 664 return ((FieldsPlugin.do_fieldnames, 100), ) 665 666 @staticmethod 667 def do_fieldnames(parser, stream): 668 newstream = stream.empty() 669 newname = None 670 for i, t in enumerate(stream): 671 if isinstance(t, FieldsPlugin.Field): 672 valid = False 673 if i < len(stream) - 1: 674 next = stream[i+1] 675 if not isinstance(next, (White, FieldsPlugin.Field)): 676 newname = t.fieldname 677 valid = True 678 if not valid: 679 newstream.append(Word(t.fieldname, fieldname=parser.fieldname)) 680 continue 681 682 if isinstance(t, Group): 683 t = FieldsPlugin.do_fieldnames(parser, t) 684 685 if newname is not None: 686 t = t.set_fieldname(newname) 687 newstream.append(t) 688 newname = None 689 690 return newstream 691 692 class Field(Token): 693 expr = rcompile(u"(\w[\w\d]*):") 694 695 def __init__(self, fieldname): 696 self.fieldname = fieldname 697 698 def __repr__(self): 699 return "<%s:>" % self.fieldname 700 701 def set_fieldname(self, fieldname): 702 return self.__class__(fieldname) 703 704 @classmethod 705 def create(cls, parser, match): 706 fieldname = match.group(1) 707 if not parser.schema or (fieldname in parser.schema): 708 return cls(fieldname) 709 710 711 class CompoundsPlugin(Plugin): 712 """Adds the ability to use AND, OR, ANDMAYBE, and ANDNOT to specify 713 query constraints. 714 715 You can customize the tokens by passing regular expressions to the ``And``, 716 ``Or``, ``AndNot``, and/or ``AndMaybe`` keywords to the class initializer:: 717 718 qp = qparser.QueryParser("content") 719 720 cp = qparser.CompoundsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~") 721 qp.replace_plugin(cp) 722 723 This plugin is included in the default parser configuration. 
724 """ 725 726 def __init__(self, And=r"\sAND\s", Or=r"\sOR\s", AndNot=r"\sANDNOT\s", 727 AndMaybe=r"\sANDMAYBE\s"): 728 # Create one-off token classes using the keyword arguments 729 class AndTokenClass(Singleton): 730 expr = rcompile(And) 731 class OrTokenClass(Singleton): 732 expr = rcompile(Or) 733 class AndNotTokenClass(Singleton): 734 expr = rcompile(AndNot) 735 class AndMaybeTokenClass(Singleton): 736 expr = rcompile(AndMaybe) 737 738 # Store these classes as attributes 739 self.And = AndTokenClass 740 self.Or = OrTokenClass 741 self.AndNot = AndNotTokenClass 742 self.AndMaybe = AndMaybeTokenClass 743 744 def tokens(self, parser): 745 return ((self.AndNot, -10), (self.AndMaybe, -5), (self.And, 0), 746 (self.Or, 0)) 747 748 def filters(self, parser): 749 return ((self.do_compounds, 600), ) 750 751 def do_compounds(self, parser, stream): 752 newstream = stream.empty() 753 i = 0 754 while i < len(stream): 755 # The current token 756 t = stream[i] 757 758 # Whether this token has other tokens in front and behind; that is, 759 # if ismiddle is True, this is not the first or last token 760 ismiddle = newstream and i < len(stream) - 1 761 762 if isinstance(t, Group): 763 # The current token is a group: recursively apply this plugin 764 # to the group 765 newstream.append(self.do_compounds(parser, t)) 766 767 elif isinstance(t, (self.And, self.Or)): 768 # This is either an And or Or token. Create a new Group class 769 # of the appropriate type 770 if isinstance(t, self.And): 771 cls = AndGroup 772 else: 773 cls = OrGroup 774 775 if cls != type(newstream) and ismiddle: 776 last = newstream.pop() 777 rest = self.do_compounds(parser, cls(stream[i+1:])) 778 newstream.append(cls([last, rest])) 779 break 780 781 elif isinstance(t, (self.AndNot, self.AndMaybe)) and ismiddle: 782 # This is either an AndNot or AndMaybe token. 
Create a new 783 # Group class of the appropriate type 784 if isinstance(t, self.AndNot): 785 cls = AndNotGroup 786 else: 787 cls = AndMaybeGroup 788 789 last = newstream.pop() 790 i += 1 791 next = stream[i] 792 if isinstance(next, Group): 793 next = self.do_compounds(parser, next) 794 newstream.append(cls([last, next])) 795 796 else: 797 newstream.append(t) 798 799 i += 1 800 801 return newstream 802 803 804 class BoostPlugin(Plugin): 805 """Adds the ability to boost clauses of the query using the circumflex. 806 807 This plugin is included in the default parser configuration. 808 """ 809 810 def tokens(self, parser): 811 return ((BoostPlugin.Boost, 0), ) 812 813 def filters(self, parser): 814 return ((BoostPlugin.clean_boost, 0), (BoostPlugin.do_boost, 700)) 815 816 @staticmethod 817 def clean_boost(parser, stream): 818 newstream = stream.empty() 819 for i, t in enumerate(stream): 820 if isinstance(t, BoostPlugin.Boost): 821 if i == 0 or isinstance(stream[i-1], (BoostPlugin.Boost, White)): 822 t = Word(t.original) 823 newstream.append(t) 824 return newstream 825 826 @staticmethod 827 def do_boost(parser, stream): 828 newstream = stream.empty() 829 830 for t in stream: 831 if isinstance(t, Group): 832 newstream.append(BoostPlugin.do_boost(parser, t)) 833 834 elif isinstance(t, BoostPlugin.Boost): 835 if newstream: 836 newstream.append(newstream.pop().set_boost(t.boost)) 837 838 else: 839 newstream.append(t) 840 841 return newstream 842 843 class Boost(Token): 844 expr = rcompile("\\^([0-9]+(.[0-9]+)?)($|(?=[ \t\r\n]))") 845 846 def __init__(self, original, boost): 847 self.original = original 848 self.boost = boost 849 850 def __repr__(self): 851 return "<^%s>" % self.boost 852 853 @classmethod 854 def create(cls, parser, match): 855 try: 856 return cls(match.group(0), float(match.group(1))) 857 except ValueError: 858 return Word(match.group(0)) 859 860 861 class NotPlugin(Plugin): 862 """Adds the ability to negate a clause by preceding it with NOT. 
863 864 You can customize the token by passing a regular expression to the class 865 initializer:: 866 867 qp = qparser.QueryParser("content") 868 869 # Use - as the not token 870 qp.replace_plugin(qparser.NotPlugin("(^|(?<= ))-")) 871 872 # Use ! as the not token 873 qp.replace_plugin(qparser.NotPlugin("(^|(?<= ))!")) 874 875 This plugin is included in the default parser configuration. 876 """ 877 878 def __init__(self, token="(^|(?<= ))NOT "): 879 class Not(Singleton): 880 expr = rcompile(token) 881 882 self.Not = Not 883 884 def tokens(self, parser): 885 return ((self.Not, 0), ) 886 887 def filters(self, parser): 888 return ((self.do_not, 800), ) 889 890 def do_not(self, parser, stream): 891 newstream = stream.empty() 892 notnext = False 893 for t in stream: 894 if isinstance(t, self.Not): 895 notnext = True 896 continue 897 898 if isinstance(t, Group): 899 t = self.do_not(parser, t) 900 901 if notnext: 902 t = NotGroup([t]) 903 904 newstream.append(t) 905 notnext = False 906 907 return newstream 908 909 910 class PlusMinusPlugin(Plugin): 911 """Adds the ability to use + and - in a flat OR query to specify required 912 and prohibited terms. 913 914 This is the basis for the parser configuration returned by 915 ``SimpleParser()``. 
916 """ 917 918 def tokens(self, parser): 919 return ((PlusMinusPlugin.Plus, 0), (PlusMinusPlugin.Minus, 0)) 920 921 def filters(self, parser): 922 return ((PlusMinusPlugin.do_plusminus, 510), ) 923 924 @staticmethod 925 def do_plusminus(parser, stream): 926 required = AndGroup() 927 optional = OrGroup() 928 prohibited = OrGroup() 929 930 nextlist = optional 931 for t in stream: 932 if isinstance(t, PlusMinusPlugin.Plus): 933 nextlist = required 934 elif isinstance(t, PlusMinusPlugin.Minus): 935 nextlist = prohibited 936 else: 937 nextlist.append(t) 938 nextlist = optional 939 940 r = optional 941 if required: 942 r = AndMaybeGroup([required, optional]) 943 if prohibited: 944 r = AndNotGroup([r, prohibited]) 945 return r 946 947 class Plus(Singleton): 948 expr = rcompile("\\+") 949 950 class Minus(Singleton): 951 expr = rcompile("-") 952 953 954 class MultifieldPlugin(Plugin): 955 """Converts any unfielded terms into OR clauses that search for the 956 term in a specified list of fields. 957 """ 958 959 def __init__(self, fieldnames, fieldboosts=None): 960 """ 961 :param fieldnames: a list of fields to search. 962 :param fieldboosts: an optional dictionary mapping field names to 963 a boost to use for that field. 964 """ 965 966 self.fieldnames = fieldnames 967 self.boosts = fieldboosts or {} 968 969 def filters(self, parser): 970 return ((self.do_multifield, 110), ) 971 972 def do_multifield(self, parser, stream): 973 newstream = stream.empty() 974 for t in stream: 975 if isinstance(t, BasicSyntax) and t.fieldname is None: 976 t = OrGroup([t.set_fieldname(fn).set_boost(self.boosts.get(fn, 1.0)) 977 for fn in self.fieldnames]) 978 newstream.append(t) 979 return newstream 980 981 982 class DisMaxPlugin(Plugin): 983 """Converts any unfielded terms into DisjunctionMax clauses that search 984 for the term in a specified list of fields. 
985 """ 986 987 def __init__(self, fieldboosts, tiebreak=0.0): 988 """ 989 :param fieldboosts: a dictionary mapping field names to a boost to use 990 for that in the DisjuctionMax query. 991 """ 992 993 self.fieldboosts = fieldboosts.items() 994 self.tiebreak = tiebreak 995 996 def filters(self, parser): 997 return ((self.do_dismax, 110), ) 998 999 def do_dismax(self, parser, stream): 1000 newstream = stream.empty() 1001 for t in stream: 1002 if isinstance(t, BasicSyntax) and t.fieldname is None: 1003 t = DisMaxGroup([t.set_fieldname(fn).set_boost(b) 1004 for fn, b in self.fieldboosts], 1005 tiebreak=self.tiebreak) 1006 newstream.append(t) 1007 return newstream 1008 1009 1010 class FieldAliasPlugin(Plugin): 1011 """Adds the ability to use "aliases" of fields in the query string. 1012 1013 >>> # Allow users to use 'body' or 'text' to refer to the 'content' field 1014 >>> parser.add_plugin(FieldAliasPlugin({"content": ("body", "text")})) 1015 >>> parser.parse("text:hello") 1016 Term("content", "hello") 1017 """ 1018 1019 def __init__(self, fieldmap): 1020 """ 1021 :param fieldmap: a dictionary mapping fieldnames to a list of 1022 aliases for the field. 
1023 """ 1024 1025 self.fieldmap = fieldmap 1026 self.reverse = {} 1027 for key, values in fieldmap.iteritems(): 1028 for value in values: 1029 self.reverse[value] = key 1030 1031 def filters(self, parser): 1032 return ((self.do_aliases, 90), ) 1033 1034 def do_aliases(self, parser, stream): 1035 newstream = stream.empty() 1036 for t in stream: 1037 if (not isinstance(t, Group) 1038 and t.fieldname is not None 1039 and t.fieldname in self.reverse): 1040 t = t.set_fieldname(self.reverse[t.fieldname]) 1041 newstream.append(t) 1042 return newstream 1043 1044 1045 # Parser object 1046 1047 full_profile = (BoostPlugin, CompoundsPlugin, FieldsPlugin, GroupPlugin, 1048 NotPlugin, PhrasePlugin, RangePlugin, SingleQuotesPlugin, 1049 WildcardPlugin) 31 full_profile = (BoostPlugin, OperatorsPlugin, FieldsPlugin, GroupPlugin, 32 PhrasePlugin, RangePlugin, SingleQuotesPlugin, WildcardPlugin) 1050 33 1051 34 … … 1065 48 And([Term("content", u"hello"), Term("content", u"there")]) 1066 49 """ 50 51 _multitoken_query_map = {"and": query.And, "or": query.Or, 52 "phrase": query.Phrase} 1067 53 1068 54 def __init__(self, fieldname, schema=None, termclass=query.Term, … … 1152 138 return [item for item, pri in items_and_priorities] 1153 139 140 def multitoken_query(self, name, texts, fieldname, termclass, boost): 141 qclass = self._multitoken_query_map.get(name.lower()) 142 if qclass: 143 return qclass([termclass(fieldname, t, boost=boost) 144 for t in texts]) 145 146 def term_query(self, fieldname, text, termclass, boost=1.0, tokenize=True, 147 removestops=True): 148 """Returns the appropriate query object for a single term in the query 149 string. 
150 """ 151 152 if self.schema and fieldname in self.schema: 153 field = self.schema[fieldname] 154 155 # If this field type wants to parse queries itself, let it do so 156 # and return early 157 if field.self_parsing(): 158 try: 159 return field.parse_query(fieldname, text, boost=boost) 160 except QueryParserError: 161 return query.NullQuery 162 163 # Otherwise, ask the field to process the text into a list of 164 # tokenized strings 165 texts = list(field.process_text(text, mode="query", 166 tokenize=tokenize, 167 removestops=removestops)) 168 169 # If the analyzer returned more than one token, use the field's 170 # multitoken_query attribute to decide what query class, if any, to 171 # use to put the tokens together 172 if len(texts) > 1: 173 mtq = self.multitoken_query(field.multitoken_query, texts, 174 fieldname, termclass, boost) 175 if mtq: 176 return mtq 177 178 # It's possible field.process_text() will return an empty list (for 179 # example, on a stop word) 180 if not texts: 181 return query.NullQuery 182 183 text = texts[0] 184 185 return termclass(fieldname, text, boost=boost) 186 1154 187 def tokens(self): 1155 188 """Returns a priorized list of tokens from the included plugins. -
branches/mbutscher/work/lib/whoosh/query.py
r230 r234 752 752 753 753 754 _wildcard_exp = re.compile("(.*?)([?*]|$)");755 754 class Wildcard(MultiTerm): 756 755 """Matches documents that contain any terms that match a wildcard … … 862 861 """ 863 862 864 if not text:865 raise QueryError("Fuzzy term is empty")866 867 863 self.fieldname = fieldname 868 864 self.text = text … … 881 877 882 878 def __repr__(self): 883 r eturn "%s(%r, %r, ratio=%f)" % (self.__class__.__name__,884 self.fieldname, self.text,885 self.ratio)879 r = "%s(%r, %r, boost=%f, minsimilarity=%f, prefixlength=%d)" 880 return r % (self.__class__.__name__, self.fieldname, self.text, 881 self.boost, self.minsimilarity, self.prefixlength) 886 882 887 883 def __unicode__(self): 888 return u"~" + self.text 884 r = u"~" + self.text 885 if self.boost != 1.0: 886 r += "^%f" % self.boost 887 return r 889 888 890 889 def copy(self): … … 982 981 983 982 def replace(self, oldtext, newtext): 984 if self.start == oldtext: 985 return TermRange(self.fieldname, newtext, self.end, 986 self.startexcl, self.endexcl, boost=self.boost) 987 elif self.end == oldtext: 988 return TermRange(self.fieldname, self.start, newtext, 989 self.startexcl, self.endexcl, boost=self.boost) 990 else: 991 return self 992 983 start = newtext if self.start == oldtext else self.start 984 end = newtext if self.end == oldtext else self.end 985 return self.__class__(self.fieldname, start, end, self.startexcl, 986 self.endexcl, boost=self.boost) 987 993 988 def _words(self, ixreader): 994 989 fieldname = self.fieldname … … 1206 1201 1207 1202 1208 class Phrase( MultiTerm):1203 class Phrase(Query): 1209 1204 """Matches documents containing a given phrase.""" 1210 1205 … … 1309 1304 1310 1305 1306 class Ordered(And): 1307 """Matches documents containing a list of sub-queries in the given order. 
1308 """ 1309 1310 JOINT = " BEFORE " 1311 1312 def matcher(self, searcher, exclude_docs=None): 1313 from spans import SpanBefore 1314 return self._matcher(SpanBefore.SpanBeforeMatcher, searcher, 1315 exclude_docs=exclude_docs) 1316 1317 1311 1318 class Every(Query): 1312 1319 """A query that matches every document containing any word in a given … … 1340 1347 def matcher(self, searcher, exclude_docs=None): 1341 1348 fieldname = self.fieldname 1342 s = set()1343 1349 1344 1350 # This is a hacky hack, but just create an in-memory set of all the 1345 1351 # document numbers of every term in the field 1346 for text in searcher.lexicon(fieldname): 1347 pr = searcher.postings(fieldname, text) 1348 s.update(pr.all_ids()) 1352 s = set() 1353 1354 if fieldname == "*": 1355 s.update(xrange(searcher.doc_count_all())) 1356 else: 1357 for text in searcher.lexicon(fieldname): 1358 pr = searcher.postings(fieldname, text) 1359 s.update(pr.all_ids()) 1360 1349 1361 if exclude_docs: 1350 1362 s.difference_update(exclude_docs) 1351 1363 1352 return ListMatcher(sorted(s), weights=[self.boost] * len(s))1364 return ListMatcher(sorted(s), all_weights=self.boost) 1353 1365 1354 1366 … … 1394 1406 else: 1395 1407 ids = array("I", m.all_ids()) 1396 return ListMatcher(ids, weights=[self.score] * len(ids))1408 return ListMatcher(ids, all_weights=self.score) 1397 1409 1398 1410 def replace(self, oldtext, newtext): … … 1451 1463 1452 1464 1453 class Require(CompoundQuery): 1465 class BinaryQuery(CompoundQuery): 1466 """Base class for binary queries (queries which are composed of two 1467 sub-queries). Subclasses should set the ``matcherclass`` attribute or 1468 override ``matcher()``, and may also need to override ``normalize()``, 1469 ``estimate_size()``, and/or ``estimate_min_size()``. 
1470 """ 1471 1472 def __init__(self, a, b, boost=1.0): 1473 self.a = a 1474 self.b = b 1475 self.subqueries = (a, b) 1476 self.boost = boost 1477 1478 def copy(self): 1479 return self.__class__(self.a, self.b, boost=self.boost) 1480 1481 def normalize(self): 1482 a = self.a.normalize() 1483 b = self.b.normalize() 1484 if a is NullQuery and b is NullQuery: 1485 return NullQuery 1486 elif a is NullQuery: 1487 return b 1488 elif b is NullQuery: 1489 return a 1490 1491 return self.__class__(a, b, boost=self.boost) 1492 1493 def matcher(self, searcher, exclude_docs=None): 1494 return self.matcherclass(self.a.matcher(searcher, exclude_docs=exclude_docs), 1495 self.b.matcher(searcher, exclude_docs=exclude_docs)) 1496 1497 1498 class Require(BinaryQuery): 1454 1499 """Binary query returns results from the first query that also appear in 1455 1500 the second query, but only uses the scores from the first query. This lets … … 1458 1503 1459 1504 JOINT = " REQUIRE " 1460 1461 def __init__(self, scoredquery, requiredquery, boost=1.0): 1462 """ 1463 :param scoredquery: The query that is scored. Only documents that also 1464 appear in the second query ('requiredquery') are scored. 1465 :param requiredquery: Only documents that match both 'scoredquery' and 1466 'requiredquery' are returned, but this query does not 1467 contribute to the scoring. 
1468 """ 1469 1470 # The superclass CompoundQuery expects the subqueries to be in a 1471 # sequence in self.subqueries 1472 self.subqueries = (scoredquery, requiredquery) 1473 self.boost = boost 1474 1475 def copy(self): 1476 return self.__class__(self.subqueries[0], self.subqueries[1], 1505 matcherclass = RequireMatcher 1506 1507 def estimate_size(self, ixreader): 1508 return self.b.estimate_size(ixreader) 1509 1510 def estimate_min_size(self, ixreader): 1511 return self.b.estimate_min_size(ixreader) 1512 1513 def normalize(self): 1514 a = self.a.normalize() 1515 b = self.b.normalize() 1516 if a is NullQuery or b is NullQuery: 1517 return NullQuery 1518 return self.__class__(a, b, boost=self.boost) 1519 1520 def replace(self, oldtext, newtext): 1521 return self.__class__(self.a.replace(oldtext, newtext), 1522 self.b.replace(oldtext, newtext), 1477 1523 boost=self.boost) 1478 1479 def estimate_size(self, ixreader): 1480 return self.subqueries[1].estimate_size(ixreader) 1481 1482 def estimate_min_size(self, ixreader): 1483 return self.subqueries[1].estimate_min_size(ixreader) 1484 1485 def normalize(self): 1486 subqueries = [q.normalize() for q in self.subqueries] 1487 if NullQuery in subqueries: 1488 return NullQuery 1489 return Require(subqueries[0], subqueries[1], boost=self.boost) 1490 1524 1491 1525 def docs(self, searcher, exclude_docs=None): 1492 1526 return And(self.subqueries).docs(searcher, exclude_docs=exclude_docs) 1493 1527 1494 def matcher(self, searcher, exclude_docs=None): 1495 scored, required = self.subqueries 1496 return RequireMatcher(scored.matcher(searcher, exclude_docs=exclude_docs), 1497 required.matcher(searcher, exclude_docs=exclude_docs)) 1498 1499 1500 class AndMaybe(CompoundQuery): 1528 1529 class AndMaybe(BinaryQuery): 1501 1530 """Binary query takes results from the first query. 
If and only if the 1502 1531 same document also appears in the results from the second query, the score … … 1505 1534 1506 1535 JOINT = " ANDMAYBE " 1507 1508 def __init__(self, requiredquery, optionalquery, boost=1.0): 1509 """ 1510 :param requiredquery: Documents matching this query are returned. 1511 :param optionalquery: If a document matches this query as well as 1512 'requiredquery', the score from this query is added to the 1513 document score from 'requiredquery'. 1514 """ 1515 1516 # The superclass CompoundQuery expects the subqueries to be 1517 # in a sequence in self.subqueries 1518 self.subqueries = (requiredquery, optionalquery) 1519 self.boost = boost 1520 1521 def copy(self): 1522 return self.__class__(self.subqueries[0], self.subqueries[1], 1523 boost=self.boost) 1536 matcherclass = AndMaybeMatcher 1524 1537 1525 1538 def normalize(self): 1526 required, optional = (q.normalize() for q in self.subqueries) 1527 if required is NullQuery: 1539 a = self.a.normalize() 1540 b = self.b.normalize() 1541 if a is NullQuery: 1528 1542 return NullQuery 1529 if optionalis NullQuery:1530 return required1531 return AndMaybe(required, optional, boost=self.boost)1543 if b is NullQuery: 1544 return a 1545 return self.__class__(a, b, boost=self.boost) 1532 1546 1533 1547 def estimate_min_size(self, ixreader): … … 1536 1550 def docs(self, searcher, exclude_docs=None): 1537 1551 return self.subqueries[0].docs(searcher, exclude_docs=exclude_docs) 1538 1539 def matcher(self, searcher, exclude_docs=None): 1540 required, optional = self.subqueries 1541 return AndMaybeMatcher(required.matcher(searcher, exclude_docs=exclude_docs), 1542 optional.matcher(searcher, exclude_docs=exclude_docs)) 1543 1544 1545 class AndNot(Query): 1552 1553 1554 class AndNot(BinaryQuery): 1546 1555 """Binary boolean query of the form 'a ANDNOT b', where documents that 1547 1556 match b are removed from the matches for a. 
1548 1557 """ 1549 1558 1550 def __init__(self, positive, negative, boost=1.0): 1551 """ 1552 :param positive: query to INCLUDE. 1553 :param negative: query whose matches should be EXCLUDED. 1554 :param boost: boost factor that should be applied to the raw score of 1555 results matched by this query. 1556 """ 1557 1558 self.positive = positive 1559 self.negative = negative 1560 self.boost = boost 1561 1562 def __eq__(self, other): 1563 return (other 1564 and self.__class__ is other.__class__ 1565 and self.positive == other.positive 1566 and self.negative == other.negative 1567 and self.boost == other.boost) 1568 1569 def __repr__(self): 1570 return "%s(%r, %r)" % (self.__class__.__name__, 1571 self.positive, self.negative) 1572 1573 def __unicode__(self): 1574 return u"%s ANDNOT %s" % (self.positive, self.negative) 1575 1576 def copy(self): 1577 return self.__class__(self.positive, self.negative, boost=self.boost) 1559 JOINT = " ANDNOT " 1578 1560 1579 1561 def normalize(self): 1580 pos = self.positive.normalize()1581 neg = self.negative.normalize()1582 1583 if posis NullQuery:1562 a = self.a.normalize() 1563 b = self.b.normalize() 1564 1565 if a is NullQuery: 1584 1566 return NullQuery 1585 elif neg is NullQuery: 1586 return pos 1587 1588 return AndNot(pos, neg, boost=self.boost) 1589 1590 def replace(self, oldtext, newtext): 1591 return AndNot(self.positive.replace(oldtext, newtext), 1592 self.negative.replace(oldtext, newtext), 1593 boost=self.boost) 1567 elif b is NullQuery: 1568 return a 1569 1570 return self.__class__(a, b, boost=self.boost) 1594 1571 1595 1572 def _all_terms(self, termset, phrases=True): 1596 self. 
positive.all_terms(termset, phrases=phrases)1573 self.a.all_terms(termset, phrases=phrases) 1597 1574 1598 1575 def _existing_terms(self, ixreader, termset, reverse=False, phrases=True): 1599 self.positive.existing_terms(ixreader, termset, reverse=reverse, 1600 phrases=phrases) 1601 1602 def matcher(self, searcher, exclude_docs=None): 1603 notvector = _not_vector(searcher, [self.negative], exclude_docs) 1604 return self.positive.matcher(searcher, exclude_docs=notvector) 1576 self.a.existing_terms(ixreader, termset, reverse=reverse, 1577 phrases=phrases) 1578 1579 def matcher(self, searcher, exclude_docs=None): 1580 # This is faster than actually using an AndNotMatcher, but could use 1581 # a lot of memory on a very large index. 1582 # TODO: Switch based on size of index? 1583 notvector = _not_vector(searcher, [self.b], exclude_docs) 1584 return self.a.matcher(searcher, exclude_docs=notvector) 1585 1586 1587 class Otherwise(BinaryQuery): 1588 """A binary query that only matches the second clause if the first clause 1589 doesn't match any documents. 1590 """ 1591 1592 JOINT = " OTHERWISE " 1593 1594 def matcher(self, searcher, exclude_docs=None): 1595 m = self.a.matcher(searcher, exclude_docs=exclude_docs) 1596 if not m.is_active(): 1597 m = self.b.matcher(searcher, exclude_docs=exclude_docs) 1598 return m 1605 1599 1606 1600 … … 1609 1603 1610 1604 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 -
branches/mbutscher/work/lib/whoosh/reading.py
r230 r234 591 591 postreaders = [] 592 592 docoffsets = [] 593 excl = exclude_docs 594 term = (fieldname, text) 595 593 596 for i, r in enumerate(self.readers): 594 try: 597 if term in r: 598 offset = self.doc_offsets[i] 599 600 # If an exclude_docs set was passed in, we need to pull out 601 # the document numbers that apply to this reader and subtract 602 # the offset from them 603 if exclude_docs and i > 0: 604 limit = offset + r.doc_count_all() 605 # Create a subset of the exclude_docs set with the offset 606 # subtracted 607 excl = set(docnum - offset for docnum in exclude_docs 608 if docnum >= offset and docnum < limit) 609 610 # Get a posting reader for the term and add it to the list 595 611 pr = r.postings(fieldname, text, scorer=scorer, 596 exclude_docs=exclude_docs) 612 exclude_docs=excl) 597 613 postreaders.append(pr) 598 docoffsets.append(self.doc_offsets[i]) 599 except TermNotFound: 600 pass 614 docoffsets.append(offset) 601 615 602 616 if not postreaders: -
branches/mbutscher/work/lib/whoosh/scoring.py
r231 r234 479 479 cache[docid] = i 480 480 481 self.limit = i 482 481 self._fieldcache = cache 483 482 return cache -
branches/mbutscher/work/lib/whoosh/searching.py
r230 r234 872 872 """Represents a single search result ("hit") in a Results object. 873 873 874 This object acts like a dictionary of the matching document's stored 875 fields. If for some reason you need an actual ``dict`` object, use 876 ``Hit.fields()`` to get one. 877 874 878 >>> r = searcher.search(query.Term("content", "render")) 875 879 >>> r[0] 876 880 <Hit {title=u"Rendering the scene"}> 881 >>> r[0].rank 882 0 877 883 >>> r[0].docnum 878 884 4592L 879 885 >>> r[0].score 880 2.52045682 886 2.52045682 887 >>> r[0]["title"] 888 "Rendering the scene" 889 >>> r[0].keys() 890 ["title"] 881 891 """ 882 892 … … 891 901 892 902 self.searcher = searcher 893 self.pos = pos903 self.pos = self.rank = pos 894 904 self.docnum = docnum 895 905 self.score = score 896 906 self._fields = None 907 908 def fields(self): 909 """Returns a dictionary of the stored fields of the document this 910 object represents. 911 """ 912 913 if self._fields is None: 914 self._fields = self.searcher.stored_fields(self.docnum) 915 return self._fields 897 916 898 917 def __repr__(self): … … 907 926 return False 908 927 909 def __iter__(self): 910 return self.fields().iterkeys() 911 912 def __getitem__(self, key): 913 return self.fields().__getitem__(key) 914 915 def __len__(self): 916 return len(self.fields()) 917 918 def fields(self): 919 if self._fields is None: 920 self._fields = self.searcher.stored_fields(self.docnum) 921 return self._fields 922 923 def get(self, key, default=None): 924 return self.fields().get(key, default) 928 def __len__(self): return len(self.fields()) 929 def __iter__(self): return self.fields().iterkeys() 930 def __getitem__(self, key): return self.fields().__getitem__(key) 931 def __contains__(self, key): return key in self.fields() 932 def items(self): return self.fields().items() 933 def keys(self): return self.fields().keys() 934 def values(self): return self.fields().values() 935 def iteritems(self): return self.fields().iteritems() 936 def iterkeys(self): 
return self.fields().iterkeys() 937 def itervalues(self): return self.fields().itervalues() 938 def get(self, key, default=None): return self.fields().get(key, default) 939 940 def __setitem__(self, key, value): 941 raise NotImplementedError("You cannot modify a search result") 942 def __delitem__(self, key, value): 943 raise NotImplementedError("You cannot modify a search result") 944 def clear(self): 945 raise NotImplementedError("You cannot modify a search result") 946 def update(self, dict=None, **kwargs): 947 raise NotImplementedError("You cannot modify a search result") 925 948 926 949 -
branches/mbutscher/work/lib/whoosh/support/bench.py
r230 r234 18 18 import os.path, random, sys 19 19 from optparse import OptionParser 20 from shutil import rmtree 20 21 from zlib import compress, decompress 21 22 … … 36 37 pass 37 38 39 try: 40 from persistent import Persistent 41 class ZDoc(Persistent): 42 def __init__(self, d): 43 self.__dict__.update(d) 44 except ImportError: 45 pass 46 47 48 class Module(object): 49 def __init__(self, bench, options, args): 50 self.bench = bench 51 self.options = options 52 self.args = args 53 54 def __repr__(self): 55 return self.__class__.__name__ 56 57 def indexer(self): 58 pass 59 60 def index_document(self, d): 61 raise NotImplementedError 62 63 def finish(self): 64 pass 65 66 def searcher(self): 67 pass 68 69 def query(self): 70 raise NotImplementedError 71 72 def find(self, q): 73 raise NotImplementedError 74 75 def findterms(self, terms): 76 raise NotImplementedError 77 78 def results(self, r): 79 return r 80 81 82 class Spec(object): 83 headline_field = "title" 84 main_field = "body" 85 whoosh_compress_main = False 86 87 def __init__(self, options, args): 88 self.options = options 89 self.args = args 90 91 def documents(self): 92 raise NotImplementedError 93 94 def setup(self): 95 pass 96 97 def print_results(self, ls): 98 showbody = self.options.showbody 99 limit = self.options.limit 100 for i, hit in enumerate(ls): 101 if i >= limit: 102 break 103 104 print "%d. 
%s" % (i+1, hit.get(self.headline_field)) 105 if showbody: 106 print hit.get(self.main_field) 107 108 class WhooshModule(Module): 109 def indexer(self): 110 schema = self.bench.spec.whoosh_schema() 111 path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname) 112 if not os.path.exists(path): 113 os.mkdir(path) 114 ix = index.create_in(path, schema) 115 self.writer = ix.writer(procs=int(self.options.procs), 116 limitmb=int(self.options.limitmb)) 117 118 def index_document(self, d): 119 if hasattr(self.bench, "process_document_whoosh"): 120 self.bench.process_document_whoosh(d) 121 if self.bench.spec.whoosh_compress_main: 122 mf = self.bench.spec.main_field 123 d["_stored_%s" % mf] = compress(d[mf], 9) 124 self.writer.add_document(**d) 125 126 def finish(self): 127 self.writer.commit() 128 129 def searcher(self): 130 path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname) 131 ix = index.open_dir(path) 132 self.srch = ix.searcher() 133 self.parser = qparser.QueryParser(self.bench.spec.main_field, schema=ix.schema) 134 135 def query(self): 136 qstring = " ".join(self.args).decode("utf8") 137 return self.parser.parse(qstring) 138 139 def find(self, q): 140 return self.srch.search(q, limit=int(self.options.limit)) 141 142 def results(self, r): 143 mf = self.bench.spec.main_field 144 for hit in r: 145 fs = hit.fields() 146 if self.bench.spec.whoosh_compress_main: 147 fs[mf] = decompress(fs[mf]) 148 yield fs 149 150 def findterms(self, terms): 151 limit = int(self.options.limit) 152 s = self.srch 153 q = query.Term(self.main_field, None) 154 for term in terms: 155 q.text = term 156 yield s.search(q, limit=limit) 157 158 159 class XappyModule(Module): 160 def indexer(self): 161 path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname) 162 conn = self.bench.spec.xappy_connection(path) 163 return conn 164 165 def index_document(self, conn, d): 166 if hasattr(self.bench, "process_document_xappy"): 167 
self.bench.process_document_xappy(d) 168 doc = xappy.UnprocessedDocument() 169 for key, values in d: 170 if not isinstance(values, list): 171 values = [values] 172 for value in values: 173 doc.fields.append(xappy.Field(key, value)) 174 conn.add(doc) 175 176 def finish(self, conn): 177 conn.flush() 178 179 def searcher(self): 180 path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname) 181 return xappy.SearchConnection(path) 182 183 def query(self, conn): 184 return conn.query_parse(" ".join(self.args)) 185 186 def find(self, conn, q): 187 return conn.search(q, 0, int(self.options.limit)) 188 189 def findterms(self, conn, terms): 190 limit = int(self.options.limit) 191 for term in terms: 192 q = conn.query_field(self.main_field, term) 193 yield conn.search(q, 0, limit) 194 195 def results(self, r): 196 hf = self.bench.spec.headline_field 197 mf = self.bench.spec.main_field 198 for hit in r: 199 yield {hf: hit.data[hf], mf: hit.data[mf]} 200 201 202 class XapianModule(Module): 203 def indexer(self): 204 path = os.path.join(self.options.dir, "%s_xapian" % self.options.indexname) 205 self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN) 206 self.ixer = xapian.TermGenerator() 207 208 def index_document(self, d): 209 if hasattr(self.bench, "process_document_xapian"): 210 self.bench.process_document_xapian(d) 211 doc = xapian.Document() 212 doc.add_value(0, d.get(self.bench.spec.headline_field, "-")) 213 doc.set_data(d[self.main_field]) 214 self.ixer.set_document(doc) 215 self.ixer.index_text(d[self.main_field]) 216 self.database.add_document(doc) 217 218 def finish(self): 219 self.database.flush() 220 221 def searcher(self): 222 path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname) 223 self.db = xapian.Database(path) 224 self.enq = xapian.Enquire(self.db) 225 self.qp = xapian.QueryParser() 226 self.qp.set_database(self.db) 227 228 def query(self): 229 return self.qp.parse_query(" ".join(self.args)) 230 231 def 
find(self, q): 232 self.enq.set_query(q) 233 return self.enq.get_mset(0, int(self.options.limit)) 234 235 def findterms(self, terms): 236 limit = int(self.options.limit) 237 for term in terms: 238 q = self.qp.parse_query(term) 239 self.enq.set_query(q) 240 yield self.enq.get_mset(0, limit) 241 242 def results(self, matches): 243 hf = self.bench.spec.headline_field 244 mf = self.bench.spec.main_field 245 for m in matches: 246 yield {hf: m.document.get_value(0), mf: m.document.get_data()} 247 248 249 class SolrModule(Module): 250 def indexer(self): 251 self.solr_doclist = [] 252 self.conn = pysolr.Solr(self.options.url) 253 self.conn.delete("*:*") 254 self.conn.commit() 255 256 def index_document(self, d): 257 self.solr_doclist.append(d) 258 if len(self.solr_doclist) >= int(self.options.batch): 259 self.conn.add(self.solr_doclist, commit=False) 260 self.solr_doclist = [] 261 262 def finish(self): 263 if self.solr_doclist: 264 self.conn.add(self.solr_doclist) 265 del self.solr_doclist 266 self.conn.optimize(block=True) 267 268 def searcher(self): 269 self.solr = pysolr.Solr(self.options.url) 270 271 def query(self): 272 return " ".join(self.args) 273 274 def find(self, q): 275 return self.solr.search(q, limit=int(self.options.limit)) 276 277 def findterms(self, terms): 278 limit = int(self.options.limit) 279 for term in terms: 280 yield self.solr.search("body:" + term, limit=limit) 281 282 283 class ZcatalogModule(Module): 284 def indexer(self): 285 from ZODB.FileStorage import FileStorage 286 from ZODB.DB import DB 287 from zcatalog import catalog 288 from zcatalog import indexes 289 import transaction 290 291 dir = os.path.join(self.options.dir, "%s_zcatalog" % self.options.indexname) 292 if os.path.exists(dir): 293 rmtree(dir) 294 os.mkdir(dir) 295 296 storage = FileStorage(os.path.join(dir, "index")) 297 db = DB(storage) 298 conn = db.open() 299 300 self.cat = catalog.Catalog() 301 self.bench.spec.zcatalog_setup(self.cat) 302 conn.root()["cat"] = self.cat 303 
transaction.commit() 304 305 self.zcatalog_count = 0 306 307 def index_document(self, d): 308 if hasattr(self.bench, "process_document_zcatalog"): 309 self.bench.process_document_zcatalog(d) 310 doc = ZDoc(d) 311 self.cat.index_doc(doc) 312 self.zcatalog_count += 1 313 if self.zcatalog_count >= 100: 314 import transaction 315 transaction.commit() 316 self.zcatalog_count = 0 317 318 def finish(self): 319 import transaction 320 transaction.commit() 321 del self.zcatalog_count 322 323 def searcher(self): 324 from ZODB.FileStorage import FileStorage 325 from ZODB.DB import DB 326 from zcatalog import catalog 327 from zcatalog import indexes 328 import transaction 329 330 path = os.path.join(self.options.dir, "%s_zcatalog" % self.options.indexname, "index") 331 storage = FileStorage(path) 332 db = DB(storage) 333 conn = db.open() 334 335 self.cat = conn.root()["cat"] 336 337 def query(self): 338 return " ".join(self.args) 339 340 def find(self, q): 341 return self.cat.searchResults(body=q) 342 343 def findterms(self, terms): 344 for term in terms: 345 yield self.cat.searchResults(body=term) 346 347 def results(self, r): 348 hf = self.bench.spec.headline_field 349 mf = self.bench.spec.main_field 350 for hit in r: 351 # Have to access the attributes for them to be retrieved 352 yield {hf: getattr(hit, hf), mf: getattr(hit, mf)} 38 353 39 354 40 355 class Bench(object): 41 solr_url = "http://localhost:8983/solr" 42 main_field = "text" 43 headline_field = "title" 44 45 libs = ("whoosh", "xappy", "xapian", "solr") 46 47 _name = "unknown" 48 49 def name(self): 50 return self._name 51 52 def process_document_whoosh(self, d): 53 pass 54 55 def process_document_xappy(self, d): 56 pass 57 58 def process_document_xapian(self, d): 59 pass 60 61 def process_document_solr(self, d): 62 pass 356 libs = {"whoosh": WhooshModule, "xappy": XappyModule, 357 "xapian": XapianModule, "solr": SolrModule, 358 "zcatalog": ZcatalogModule} 63 359 64 360 def index(self, lib): … … 73 369 74 370 
starttime = chunkstarttime = now() 75 ix = getattr(self, "%s_indexer" % lib)() 76 index_document = getattr(self, "index_document_%s" % lib) 77 for d in self.documents(): 371 lib.indexer() 372 for d in self.spec.documents(): 78 373 skipc -= 1 79 374 if not skipc: 80 index_document(ix,d)375 lib.index_document(d) 81 376 count += 1 82 377 skipc = skip … … 91 386 spooltime = now() 92 387 print "Spool time:", spooltime - starttime 93 getattr(self, "finish_%s" % lib)(ix)388 lib.finish() 94 389 committime = now() 95 390 print "Commit time:", committime - spooltime 96 391 print "Total time to index", count, "documents:", committime - starttime 97 392 98 def whoosh_indexer(self):99 schema = self.whoosh_schema()100 path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)101 if not os.path.exists(path):102 os.mkdir(path)103 ix = index.create_in(path, schema)104 w = ix.writer(procs=int(self.options.procs),105 limitmb=int(self.options.limitmb))106 return w107 108 def index_document_whoosh(self, writer, d):109 self.process_document_whoosh(d)110 writer.add_document(**d)111 112 def finish_whoosh(self, writer):113 writer.commit()114 115 def xappy_indexer(self):116 path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)117 conn = self.xappy_connection(path)118 return conn119 120 def index_document_xappy(self, conn, d):121 self.process_document_xappy(d)122 doc = xappy.UnprocessedDocument()123 for key, values in d:124 if not isinstance(values, list):125 values = [values]126 for value in values:127 doc.fields.append(xappy.Field(key, value))128 conn.add(doc)129 130 def finish_xappy(self, conn):131 conn.flush()132 133 def xapian_indexer(self):134 path = os.path.join(self.options.dir, "%s_xapian" % self.options.indexname)135 database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN)136 indexer = xapian.TermGenerator()137 138 return (database, indexer)139 140 def index_document_xapian(self, dix, d):141 self.process_document_xapian(d)142 
database, indexer = dix143 doc = xapian.Document()144 doc.add_value(0, d.get(self.headline_field, "-"))145 doc.set_data(d[self.main_field])146 indexer.set_document(doc)147 indexer.index_text(d[self.main_field])148 database.add_document(doc)149 150 def finish_xapian(self, dix):151 dix[0].flush()152 153 def solr_indexer(self):154 self.solr_doclist = []155 conn = pysolr.Solr(self.options.url)156 conn.delete("*:*")157 conn.commit()158 return conn159 160 def index_document_solr(self, conn, d):161 self.solr_doclist.append(d)162 if len(self.solr_doclist) >= int(self.options.batch):163 conn.add(self.solr_doclist, commit=False)164 self.solr_doclist = []165 166 def finish_solr(self, conn):167 if self.solr_doclist:168 conn.add(self.solr_doclist)169 del self.solr_doclist170 conn.optimize(block=True)171 172 def whoosh_searcher(self):173 path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)174 ix = index.open_dir(path)175 searcher = ix.searcher()176 parser = qparser.QueryParser(self.main_field, schema=ix.schema)177 178 return (searcher, parser)179 180 def whoosh_query(self, s):181 qstring = " ".join(self.args).decode("utf8")182 return s[1].parse(qstring)183 184 def whoosh_find(self, s, q):185 return s[0].search(q, limit=int(self.options.limit))186 187 def whoosh_findterms(self, s, terms):188 limit = int(self.options.limit)189 searcher = s[0]190 q = query.Term(self.main_field, None)191 for term in terms:192 q.text = term193 yield searcher.search(q, limit=limit)194 195 def whoosh_results(self, s, r):196 showbody = self.options.showbody197 198 print "Runtime:", r.runtime199 for hit in r:200 print hit.get(self.headline_field)201 if showbody:202 print decompress(hit[self.main_field])203 204 def xappy_searcher(self):205 path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)206 return xappy.SearchConnection(path)207 208 def xappy_query(self, conn):209 return conn.query_parse(" ".join(self.args))210 211 def xappy_find(self, conn, q):212 return 
conn.search(q, 0, int(self.options.limit))213 214 def xappy_findterms(self, conn, terms):215 limit = int(self.options.limit)216 for term in terms:217 q = conn.query_field(self.main_field, term)218 yield conn.search(q, 0, limit)219 220 def xappy_results(self, conn, r):221 showbody = self.options.showbody222 for hit in r:223 print hit.rank, hit.data[self.headline_field]224 if showbody:225 print hit.data[self.main_field]226 227 def xapian_searcher(self):228 path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname)229 db = xapian.Database(path)230 enq = xapian.Enquire(db)231 qp = xapian.QueryParser()232 qp.set_database(db)233 return db, enq, qp234 235 def xapian_query(self, s):236 return s[2].parse_query(" ".join(self.args))237 238 def xapian_find(self, s, q):239 enq = s[1]240 enq.set_query(q)241 return enq.get_mset(0, int(self.options.limit))242 243 def xapian_findterms(self, s, terms):244 limit = int(self.options.limit)245 db, enq, qp = s246 for term in terms:247 q = qp.parse_query(term)248 enq.set_query(q)249 yield enq.get_mset(0, limit)250 251 def xapian_results(self, s, matches):252 showbody = self.options.showbody253 for m in matches:254 print m.rank, repr(m.document.get_value(0))255 if showbody:256 print m.document.get_data()257 258 def solr_searcher(self):259 return pysolr.Solr(self.solr_url)260 261 def solr_query(self, solr):262 return " ".join(self.args)263 264 def solr_find(self, solr, q):265 return solr.search(q, limit=int(self.options.limit))266 267 def solr_findterms(self, solr, terms):268 limit = int(self.options.limit)269 for term in terms:270 yield solr.search("body:" + term, limit=limit)271 272 def solr_results(self, solr, r):273 showbody = self.options.showbody274 print len(r), "results"275 for hit in r:276 print hit.get(self.headline_field)277 if showbody:278 print hit[self.main_field]279 280 393 def search(self, lib): 281 s = getattr(self, "%s_searcher" % lib)() 394 lib.searcher() 395 282 396 t = now() 283 q = getattr(self, 
"%s_query" % lib)(s)397 q = lib.query() 284 398 print "Query:", q 285 r = getattr(self, "%s_find" % lib)(s,q)399 r = lib.find(q) 286 400 print "Search time:", now() - t 287 401 288 402 t = now() 289 getattr(self, "%s_results" % lib)(s, r)403 self.spec.print_results(lib.results(r)) 290 404 print "Print time:", now() - t 291 405 … … 296 410 297 411 print "Searching %d terms with %s" % (len(terms), lib) 298 s = getattr(self, "%s_searcher" % lib)()412 lib.searcher() 299 413 starttime = now() 300 for r in getattr(self, "%s_findterms" % lib)(s,terms):414 for r in lib.findterms(terms): 301 415 pass 302 416 searchtime = now() - starttime 303 417 print "Search time:", searchtime, "searches/s:", float(len(terms))/searchtime 304 418 305 def generate_search_file(self, lib): 306 if self.args: 307 f = open(self.args[0], "wb") 308 else: 309 f = sys.stdout 310 count = int(self.options.generate) 311 312 t = now() 313 s = self.whoosh_searcher()[0] 314 terms = list(s.lexicon(self.main_field)) 315 sample = random.sample(terms, count) 316 for term in sample: 317 if term.isalnum(): 318 f.write(term + "\n") 319 print now() - t 320 321 def _parser(self): 419 def _parser(self, name): 322 420 p = OptionParser() 323 421 p.add_option("-x", "--lib", dest="lib", … … 331 429 help="Index the documents.", default=False) 332 430 p.add_option("-n", "--name", dest="indexname", metavar="PREFIX", 333 help="Index name prefix.", default="%s_index" % self.name())431 help="Index name prefix.", default="%s_index" % name) 334 432 p.add_option("-U", "--url", dest="url", metavar="URL", 335 433 help="Solr URL", default="http://localhost:8983/solr") … … 363 461 return p 364 462 365 def run(self ):366 parser = self._parser( )463 def run(self, specclass): 464 parser = self._parser(specclass.name) 367 465 options, args = parser.parse_args() 368 466 self.options = options 369 467 self.args = args 370 468 371 lib = options.lib 372 if lib not in self.libs: 373 raise Exception("Unknown library: %r" % lib) 469 if 
options.lib not in self.libs: 470 raise Exception("Unknown library: %r" % options.lib) 471 lib = self.libs[options.lib](self, options, args) 472 473 self.spec = specclass(options, args) 374 474 375 475 if options.setup: 376 self.s etup()476 self.spec.setup() 377 477 378 478 action = self.search -
branches/mbutscher/work/lib/whoosh/support/unicode.py
r230 r234 249 249 return _names[i] 250 250 251 251 252 def blocknum(ch): 252 253 """Returns the unicode block number for ch, or None if ch has no block. … … 268 269 269 270 270 if __name__ == "__main__": 271 pass 272 273 274 275 276 271 272 273 274 275 -
branches/stable-2.0/extensions/GnuplotClBridge.py
r231 r234 117 117 # childIn, childOut, childErr = os.popen3(cmdline, "b") 118 118 popenObject = subprocess.Popen(cmdline, shell=True, 119 stderr=subprocess.PIPE) 119 stderr=subprocess.PIPE, stdout=subprocess.PIPE, 120 stdin=subprocess.PIPE) 120 121 childErr = popenObject.stderr 121 122 123 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 124 # why this is necessary 125 popenObject.stdin.close() 126 popenObject.stdout.close() 127 122 128 if u"noerror" in [a.strip() for a in insToken.appendices]: 123 129 childErr.read() -
branches/stable-2.0/extensions/GraphvizClBridge.py
r231 r234 124 124 # childIn, childOut, childErr = os.popen3(cmdline, "b") 125 125 popenObject = subprocess.Popen(cmdline, shell=True, 126 stderr=subprocess.PIPE) 126 stderr=subprocess.PIPE, stdout=subprocess.PIPE, 127 stdin=subprocess.PIPE) 127 128 childErr = popenObject.stderr 129 130 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 131 # why this is necessary 132 popenObject.stdin.close() 133 popenObject.stdout.close() 128 134 129 135 if u"noerror" in [a.strip() for a in insToken.appendices]: -
branches/stable-2.0/extensions/GraphvizStructureView.py
r231 r234 342 342 # childIn, childOut, childErr = os.popen3(cmdline, "b") 343 343 popenObject = subprocess.Popen(cmdline, shell=True, 344 stderr=subprocess.PIPE) 344 stderr=subprocess.PIPE, stdout=subprocess.PIPE, 345 stdin=subprocess.PIPE) 345 346 childErr = popenObject.stderr 347 348 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 349 # why this is necessary 350 popenObject.stdin.close() 351 popenObject.stdout.close() 346 352 347 353 if u"noerror" in [a.strip() for a in insParams]: -
branches/stable-2.0/extensions/MimeTexCGIBridge.py
r231 r234 103 103 # Run MimeTeX process 104 104 popenObject = subprocess.Popen(cmdline, shell=True, 105 stdout=subprocess.PIPE) 105 stdout=subprocess.PIPE, stdin=subprocess.PIPE, 106 stderr=subprocess.PIPE) 107 106 108 childOut = popenObject.stdout 109 110 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 111 # why this is necessary 112 popenObject.stdin.close() 113 popenObject.stderr.close() 107 114 108 115 # Read stdout of process entirely -
branches/stable-2.0/extensions/PloticusClBridge.py
r231 r234 128 128 # childIn, childOut, childErr = os.popen3(cmdline, "b") 129 129 popenObject = subprocess.Popen(cmdline, shell=True, 130 stderr=subprocess.PIPE) 130 stderr=subprocess.PIPE, stdout=subprocess.PIPE, 131 stdin=subprocess.PIPE) 131 132 childErr = popenObject.stderr 132 133 134 # See http://bytes.com/topic/python/answers/634409-subprocess-handle-invalid-error 135 # why this is necessary 136 popenObject.stdin.close() 137 popenObject.stdout.close() 138 133 139 if u"noerror" in [a.strip() for a in insToken.appendices]: 134 140 childErr.read()
