```python
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


class SimpleSearchEngine:
    def __init__(self):
        self.documents = {}                     # doc_id -> raw content
        self.inverted_index = defaultdict(set)  # term -> set of doc_ids
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def preprocess(self, text):
        """Text preprocessing: lowercase, tokenize, remove stopwords, stem."""
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        tokens = word_tokenize(text)
        tokens = [self.stemmer.stem(token) for token in tokens if token not in self.stop_words]
        return tokens

    def add_document(self, doc_id, content):
        """Add a document to the search engine and index its terms."""
        self.documents[doc_id] = content
        words = self.preprocess(content)
        for word in words:
            self.inverted_index[word].add(doc_id)

    def search(self, query, top_k=10):
        """Search with AND semantics, then rank by simple term frequency."""
        query_words = self.preprocess(query)
        result_docs = None
        for word in query_words:
            if word in self.inverted_index:
                if result_docs is None:
                    result_docs = self.inverted_index[word].copy()
                else:
                    result_docs &= self.inverted_index[word]  # intersection (AND semantics)
            else:
                result_docs = set()  # if any term has no match, the result is empty
                break
        result_docs = list(result_docs) if result_docs else []
        # Simple ranking: score by how often the query terms occur in each document
        scored_docs = []
        for doc_id in result_docs:
            content = self.documents[doc_id].lower()
            score = sum(content.count(word) for word in query_words)
            scored_docs.append((doc_id, score))
        scored_docs.sort(key=lambda x: -x[1])
        return [doc_id for doc_id, _ in scored_docs[:top_k]]

# === Usage Example ===
if __name__ == "__main__":
    engine = SimpleSearchEngine()

    # Add some test documents
    engine.add_document(1, "Python is a powerful programming language.")
    engine.add_document(2, "I love coding in Python for data science.")
    engine.add_document(3, "Machine learning uses Python and R.")
    engine.add_document(4, "Java is another popular programming language.")
    engine.add_document(5, "Web development with JavaScript and Python.")

    # Search
    query = "programming language Python"
    results = engine.search(query)
    print(f"Search results for '{query}':")
    for doc_id in results:
        print(f"Doc {doc_id}: {engine.documents[doc_id]}")
```
🔍 Example output:

```
Search results for 'programming language Python':
Doc 1: Python is a powerful programming language.
```
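With AND semantics only Doc 1 is returned: it is the only document that contains all three query terms. A quick check of the preprocessing step makes this visible (assuming the `engine` built in the usage example above):

```python
# Inspect what the query is reduced to before the index lookup
print(engine.preprocess("programming language Python"))
# -> ['program', 'languag', 'python']  (stopwords removed, Porter-stemmed)
```

Docs 2 and 5 contain only the term `python`, so the intersection drops them.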
💡 Example: search with OR semantics
```python
def search_or(self, query):
    """Search with OR semantics: return the union of all matching documents."""
    query_words = self.preprocess(query)
    result_docs = set()
    for word in query_words:
        if word in self.inverted_index:
            result_docs |= self.inverted_index[word]  # union (OR semantics)
    return list(result_docs)
```
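A minimal usage sketch, assuming `search_or` has been added as a method of the `SimpleSearchEngine` class above:

```python
# Hypothetical usage: search_or attached to the SimpleSearchEngine instance
results = engine.search_or("Java Python")
print(results)  # all five test documents match: each mentions Java or Python
```

Unlike `search`, this variant does no frequency scoring, so the returned order is arbitrary.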
The same ideas scale up with Whoosh, a pure-Python search library that provides schemas, on-disk index persistence, and ranked queries out of the box:

```python
import os
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

# Define the index schema
schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))

# Create the on-disk index directory if needed
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = create_in("indexdir", schema)

# Add documents
writer = ix.writer()
writer.add_document(title="Python Guide", content="Learn Python programming language")
writer.add_document(title="Data Science", content="Python is used in data science")
writer.commit()

# Query the index
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("python")
    results = searcher.search(query)
    for r in results:
        print(r['title'], r['content'])
```
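Both documents contain "python", so both are returned; Whoosh ranks hits with its default BM25F scorer, so results come back in relevance order rather than insertion order.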