Skip to content

Commit

Permalink
上传循环摇摆的动作
Browse files Browse the repository at this point in the history
  • Loading branch information
worm128 committed Mar 28, 2024
1 parent d1dd926 commit 009e1e9
Showing 1 changed file with 50 additions and 0 deletions.
50 changes: 50 additions & 0 deletions test/htmlspider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import requests
from bs4 import BeautifulSoup
import jieba
import re

def crawl_web(url, timeout=10):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a silent host.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # When the server sends a text/* Content-Type without a charset, requests
    # assumes ISO-8859-1, which turns non-Latin pages (e.g. Chinese) into
    # mojibake. Fall back to the encoding detected from the body instead.
    if response.encoding and response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    return response.text

def parse_html(html_content):
    """Parse *html_content* with BeautifulSoup's 'html.parser' and return its text."""
    return BeautifulSoup(html_content, 'html.parser').get_text()

def extract_content(html_content):
    """Strip tag-like spans and normalize whitespace in *html_content*.

    The substitutions run in order: ``<...>`` spans are removed, carriage
    returns, newlines and tabs are deleted outright (not replaced by a
    space), and runs of spaces collapse to a single space. The result is
    trimmed at both ends.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    cleanup_steps = (
        ('<[^<]+?>', ''),
        ('[\r\n\t]', ''),
        (' +', ' '),
    )
    text = html_content
    for pattern, replacement in cleanup_steps:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def tokenize(text):
    """Segment *text* with jieba and return the tokens joined by single spaces."""
    tokens = jieba.cut(text)
    return ' '.join(tokens)

def search(query, documents):
    """Return the documents that contain every token of *query*.

    Args:
        query: Search string; it is segmented with ``tokenize``.
        documents: Iterable of plain-text documents to scan.

    Returns:
        A list of the documents, in input order, in which each query token
        occurs as a substring. A query that yields no tokens (e.g. the empty
        string) matches every document.
    """
    # Match token by token. The documents are raw, un-segmented text, so the
    # space-joined token string (e.g. "油 库里") would never occur in them and
    # any multi-token query would silently return nothing.
    query_tokens = tokenize(query).split()
    return [
        doc for doc in documents
        if all(token in doc for token in query_tokens)
    ]

# Example usage: crawl one page, index its text, then run a query against it.
url = 'https://baike.baidu.com/item/油库里'  # replace with the site you want to crawl
user_query = ''  # replace with the query entered by the user

# Fetch the page and reduce it to cleaned plain text.
html_content = crawl_web(url)
parsed_content = parse_html(html_content)
full_text = extract_content(parsed_content)

# The "search engine" corpus holds just this one document.
documents = [full_text]

# Run the search and print every matching document.
results = search(user_query, documents)
for result in results:
    print(result)

0 comments on commit 009e1e9

Please sign in to comment.