Skip to content

Commit

Permalink
🐛 Bug: Fix the bug of not finding the lxml library when crawling website content.
Browse files Browse the repository at this point in the history
  • Loading branch information
yym68686 committed Apr 24, 2024
1 parent da29338 commit 2a49e48
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ md2tgmd==0.1.2
# jieba
python-dotenv
beautifulsoup4
lxml
python-telegram-bot[webhooks,rate-limiter]==21.0.1
# python-telegram-bot[webhooks,rate-limiter]==20.6

Expand Down
7 changes: 4 additions & 3 deletions test/test_Web_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def jina_ai_Web_crawler(url: str, isSearch=False) -> str:
# for url in ['https://s.weibo.com/top/summary?cate=realtimehot']:
# for url in ['https://tophub.today/n/KqndgxeLl9']:
# for url in ['https://support.apple.com/zh-cn/HT213931']:
for url in ['https://finance.sina.com.cn/stock/roll/2023-06-26/doc-imyyrexk4053724.shtml', 'https://s.weibo.com/top/summary?cate=realtimehot', 'https://tophub.today/n/KqndgxeLl9', 'https://www.whatsonweibo.com/', 'https://www.trendingonweibo.com/?ref=producthunt', 'https://www.trendingonweibo.com/', 'https://www.statista.com/statistics/1377073/china-most-popular-news-on-weibo/']:
for url in ["https://zeta.zeabur.app", "https://www.anthropic.com/research/probes-catch-sleeper-agents"]:
# for url in ['https://finance.sina.com.cn/stock/roll/2023-06-26/doc-imyyrexk4053724.shtml', 'https://s.weibo.com/top/summary?cate=realtimehot', 'https://tophub.today/n/KqndgxeLl9', 'https://www.whatsonweibo.com/', 'https://www.trendingonweibo.com/?ref=producthunt', 'https://www.trendingonweibo.com/', 'https://www.statista.com/statistics/1377073/china-most-popular-news-on-weibo/']:
# for url in ['https://www.usnews.com/news/entertainment/articles/2023-12-22/china-drafts-new-rules-proposing-restrictions-on-online-gaming']:
# for url in ['https://developer.aliyun.com/article/721836']:
# for url in ['https://cn.aliyun.com/page-source/price/detail/machinelearning_price']:
Expand All @@ -181,8 +182,8 @@ def jina_ai_Web_crawler(url: str, isSearch=False) -> str:
# for url in ['https://www.reddit.com/r/China_irl/comments/15qojkh/46%E6%9C%88%E5%A4%96%E8%B5%84%E5%AF%B9%E4%B8%AD%E5%9B%BD%E7%9B%B4%E6%8E%A5%E6%8A%95%E8%B5%84%E5%87%8F87/', 'https://www.apple.com.cn/job-creation/Apple_China_CSR_Report_2020.pdf', 'https://hdr.undp.org/system/files/documents/hdr2013chpdf.pdf']:
# for url in ['https://www.airuniversity.af.edu/JIPA/Display/Article/3111127/the-uschina-trade-war-vietnam-emerges-as-the-greatest-winner/']:
# for url in ['https://zhuanlan.zhihu.com/p/646786536', 'https://zh.wikipedia.org/wiki/%E4%BF%84%E7%BE%85%E6%96%AF%E5%85%A5%E4%BE%B5%E7%83%8F%E5%85%8B%E8%98%AD', 'https://stock.finance.sina.com.cn/usstock/quotes/aapl.html']:
# Web_crawler(url)
jina_ai_Web_crawler(url)
Web_crawler(url)
# jina_ai_Web_crawler(url)
print('-----------------------------')
end_time = time.time()
run_time = end_time - start_time
Expand Down

0 comments on commit 2a49e48

Please sign in to comment.