Skip to content

Commit

Permalink
Update dependencies and modify search functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
yym68686 committed Dec 19, 2023
1 parent 6ef7b59 commit e4ef199
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 23 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ python-telegram-bot[webhooks,rate-limiter]==20.6
fake_useragent
openai==0.28.1
google-api-python-client
duckduckgo-search==3.9.6
duckduckgo-search==4.1.0
langchain==0.0.271
oauth2client==3.0.0
pdfminer.six
Expand Down
85 changes: 72 additions & 13 deletions test/test_ddg.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,67 @@
import re
import time
import requests
import json
import os
from bs4 import BeautifulSoup
from langchain.tools import DuckDuckGoSearchResults
from duckduckgo_search import DDGS
def getddgsearchurl1(result, numresults=3):
    """Scrape the DuckDuckGo HTML endpoint for result links.

    Args:
        result: Search query string.
        numresults: Maximum number of URLs to return (default 3).

    Returns:
        A list of result URLs; empty list on any request/parse error.
    """
    from urllib.parse import quote  # local import: keeps the script's top imports untouched

    # URL-encode the query so spaces / CJK characters survive the request URL.
    requrl = f"https://html.duckduckgo.com/html?q={quote(result)}&kl=us-en&s=0&dc=0"
    try:
        response = requests.get(requrl)
        soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
        print(soup)
        urls = []
        # Match http AND https links; the original "^http://" pattern
        # silently dropped every https result.
        for link in soup.findAll('a', attrs={'href': re.compile(r"^https?://")}):
            urls.append(link.get('href'))
        urls = urls[:numresults]
    except Exception as e:
        print('\033[31m')
        print("duckduckgo error", e)
        print('\033[0m')
        urls = []
    return urls

def search_duckduckgo(query):
    """Fetch the DuckDuckGo HTML results page for *query* and print every result URL."""
    response = requests.get('https://duckduckgo.com/html/',
                            params={'q': query, 'ia': 'web'})
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup)
    # Result anchors carry the "result__url" class on DuckDuckGo's HTML page.
    for anchor in soup.find_all('a', class_='result__url'):
        print(anchor.get('href'))



from duckduckgo_search import DDGS

def getddg(result, numresults=3):
    """Search DuckDuckGo via the duckduckgo_search client.

    Args:
        result: Search query string.
        numresults: Maximum number of result URLs (default 3).

    Returns:
        A list of result URLs; empty list if the client yields nothing.
    """
    with DDGS(timeout=2) as ddgs:
        # ddgs.text() can return None / no hits; guard so we always return a
        # list instead of raising TypeError in the comprehension.
        hits = ddgs.text(result, max_results=numresults) or []
        results = [r["href"] for r in hits]
    # print(json.dumps(results, ensure_ascii=False, indent=4))
    return results

def getddgsearchurl(result, numresults=3):
    """Run a DuckDuckGo search through langchain and extract result URLs.

    Args:
        result: Search query string.
        numresults: Maximum number of results requested (default 3).

    Returns:
        A list of URLs parsed from the search output; empty list on error.
    """
    try:
        # webresult = getddg(result, numresults)
        search = DuckDuckGoSearchResults(num_results=numresults)
        webresult = search.run(result)
        print(webresult)
        if webresult is None:  # `is None`, not `== None`
            return []
        # URLs appear as "...](http://...)" in the tool output; capture up to the "]".
        urls = re.findall(r"(https?://\S+)\]", webresult, re.MULTILINE)
    except Exception as e:
        print('\033[31m')
        print("duckduckgo error", e)
        print('\033[0m')
        urls = []
    return urls

# NOTE(review): these two statements fire a live DuckDuckGo search at module
# import time — presumably leftover scratch code; consider moving them under
# the `if __name__ == '__main__':` guard below.
urls = getddgsearchurl("你知道今天有什么热点新闻吗")
print(urls)

def Web_crawler(url: str) -> str:
"""返回链接网址url正文内容,必须是合法的网址"""
Expand Down Expand Up @@ -39,12 +89,21 @@ def Web_crawler(url: str) -> str:
print('\033[0m')
return result

if __name__ == '__main__':
    # Time one end-to-end search so regressions in the DDG backends are visible.
    start_time = time.time()

    # search_duckduckgo('python programming')
    # print(getddg("尊嘟假嘟 含义"))
    # urls = getddgsearchurl("python programming")
    # urls = getddgsearchurl1("test")
    # urls = getddgsearchurl("你知道今天有什么热点新闻吗")
    # urls = getddg("尊嘟假嘟 含义")
    urls = getddgsearchurl("它会返回一个包含搜索结果的列表")
    print(urls)
    # for url in urls:
    #     print(Web_crawler(url))
    #     print('-----------------------------')
    end_time = time.time()
    run_time = end_time - start_time
    # Print the elapsed run time.
    print(f"程序运行时间:{run_time}秒")
19 changes: 17 additions & 2 deletions test/test_tikitoken.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,19 @@
import tiktoken

# Register the cl100k_base encoding for Anthropic's claude-2.1 so that
# tiktoken.encoding_for_model() can resolve the model name.
tiktoken.get_encoding("cl100k_base")
tiktoken.model.MODEL_TO_ENCODING["claude-2.1"] = "cl100k_base"
encoding = tiktoken.encoding_for_model("claude-2.1")

tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")
# encoding = tiktoken.encoding_for_model("claude-2.1")

encode_web_text_list = []
if encode_web_text_list == []:
    # Only (re-)encode when the token buffer is empty.
    encode_web_text_list = encoding.encode("Hello, my dog is cute")
    print("len", len(encode_web_text_list))
# Decode the first two tokens, then drop them from the buffer.
function_response = encoding.decode(encode_web_text_list[:2])
print(function_response)
encode_web_text_list = encode_web_text_list[2:]
print(encode_web_text_list)

# Decode a hard-coded token-id list with a fresh encoder instance.
encode_web_text_list = [856, 5679, 374, 19369]
tiktoken.get_encoding("cl100k_base")
encoding1 = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")
function_response = encoding1.decode(encode_web_text_list[:2])
print(function_response)
18 changes: 12 additions & 6 deletions utils/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def Web_crawler(url: str, isSearch=False) -> str:
# print("url content", result + "\n\n")
return result

def getddgsearchurl(result, numresults=3):
def getddgsearchurl(result, numresults=4):
try:
search = DuckDuckGoSearchResults(num_results=numresults)
webresult = search.run(result)
Expand All @@ -314,6 +314,7 @@ def getddgsearchurl(result, numresults=3):
print("duckduckgo error", e)
print('\033[0m')
urls = []
# print("ddg urls", urls)
return urls

def getgooglesearchurl(result, numresults=3):
Expand All @@ -332,6 +333,7 @@ def getgooglesearchurl(result, numresults=3):
if "rateLimitExceeded" in str(e):
print("Google API 每日调用频率已达上限,请明日再试!")
config.USE_GOOGLE = False
# print("google urls", urls)
return urls

def get_search_url(prompt, chainllm):
Expand Down Expand Up @@ -390,28 +392,32 @@ def get_search_url(prompt, chainllm):
search_threads = []
urls_set = []
if len(keywords) == 3:
search_url_num = 8
search_url_num = 4
if len(keywords) == 2:
search_url_num = 12
search_url_num = 6
if len(keywords) == 1:
search_url_num = 24
search_url_num = 12
# print(keywords)
if config.USE_GOOGLE:
search_thread = ThreadWithReturnValue(target=getgooglesearchurl, args=(keywords[0],search_url_num,))
search_thread.start()
search_threads.append(search_thread)
keywords = keywords.pop(0)

keywords.pop(0)
# print(keywords)
for keyword in keywords:
search_thread = ThreadWithReturnValue(target=getddgsearchurl, args=(keyword,search_url_num,))
search_thread.start()
search_threads.append(search_thread)
# exit(0)

for t in search_threads:
tmp = t.join()
urls_set += tmp
url_set_list = sorted(set(urls_set), key=lambda x: urls_set.index(x))
# cut_num = int(len(url_set_list) * 2 / 3)
url_pdf_set_list = [item for item in url_set_list if item.endswith(".pdf")]
url_set_list = [item for item in url_set_list if not item.endswith(".pdf")]
# return url_set_list[:cut_num], url_pdf_set_list
return url_set_list, url_pdf_set_list

def concat_url(threads):
Expand Down
3 changes: 2 additions & 1 deletion utils/chatgpt2api.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,8 @@ def ask_stream(
if self.conversation[convo_id][-1 - index]["role"] == "user":
self.conversation[convo_id][-1 - index]["content"] = self.conversation[convo_id][-1 - index]["content"].replace("search: ", "")
prompt = self.conversation[convo_id][-1 - index]["content"]
prompt = " ".join([prompt, json.loads(full_response)["prompt"].strip()]).strip()
if json.loads(full_response)["prompt"].strip() != prompt:
prompt = " ".join([prompt, json.loads(full_response)["prompt"].strip()]).strip()
print("\n\nprompt", prompt)
break
tiktoken.get_encoding("cl100k_base")
Expand Down

0 comments on commit e4ef199

Please sign in to comment.