From 8d963f7e2013531192a2c29a06845dbae1c3fe3f Mon Sep 17 00:00:00 2001 From: Evgenii Khoroshev Date: Tue, 5 Dec 2023 19:09:37 +0800 Subject: [PATCH] duckduckgo search and websearch retriever --- src/libs/LangChain.Core/LangChain.Core.csproj | 5 +- .../Retrievers/WebSearchRetriever.cs | 32 ++ .../Utilities/DuckDuckGoSearch.cs | 396 ++++++++++++++++++ .../Utilities/DuckDuckGoSearchAPIWrapper.cs | 101 +++++ .../LangChain.Core/Utilities/IWebSearch.cs | 14 + .../Utilities/DuckDuckGoSearchTests.cs | 42 ++ 6 files changed, 586 insertions(+), 4 deletions(-) create mode 100644 src/libs/LangChain.Core/Retrievers/WebSearchRetriever.cs create mode 100644 src/libs/LangChain.Core/Utilities/DuckDuckGoSearch.cs create mode 100644 src/libs/LangChain.Core/Utilities/DuckDuckGoSearchAPIWrapper.cs create mode 100644 src/libs/LangChain.Core/Utilities/IWebSearch.cs create mode 100644 src/tests/LangChain.Core.UnitTests/Utilities/DuckDuckGoSearchTests.cs diff --git a/src/libs/LangChain.Core/LangChain.Core.csproj b/src/libs/LangChain.Core/LangChain.Core.csproj index c3105549..3869d81e 100644 --- a/src/libs/LangChain.Core/LangChain.Core.csproj +++ b/src/libs/LangChain.Core/LangChain.Core.csproj @@ -14,10 +14,6 @@ - - - - LangChain core classes. 
$(PackageTags);core @@ -32,6 +28,7 @@ + diff --git a/src/libs/LangChain.Core/Retrievers/WebSearchRetriever.cs b/src/libs/LangChain.Core/Retrievers/WebSearchRetriever.cs new file mode 100644 index 00000000..b97d07a1 --- /dev/null +++ b/src/libs/LangChain.Core/Retrievers/WebSearchRetriever.cs @@ -0,0 +1,32 @@ +using LangChain.Callback; +using LangChain.Docstore; +using LangChain.Utilities; + +namespace LangChain.Retrievers; + +public sealed class WebSearchRetriever : BaseRetriever +{ + private readonly IWebSearch _webSearch; + private readonly int _k; + + public WebSearchRetriever(IWebSearch webSearch, int k = 10) + { + _webSearch = webSearch; + _k = k; + } + + protected override async Task> GetRelevantDocumentsCoreAsync( + string query, + CallbackManagerForRetrieverRun runManager = null) + { + var searchResult = await _webSearch.ResultsAsync(query, _k); + + return searchResult.Select(v => new Document( + v.Body, + new Dictionary() + { + ["title"] = v.Title, + ["link"] = v.Link + })); + } +} \ No newline at end of file diff --git a/src/libs/LangChain.Core/Utilities/DuckDuckGoSearch.cs b/src/libs/LangChain.Core/Utilities/DuckDuckGoSearch.cs new file mode 100644 index 00000000..b7ba93e3 --- /dev/null +++ b/src/libs/LangChain.Core/Utilities/DuckDuckGoSearch.cs @@ -0,0 +1,396 @@ +using System.Net; +using System.Net.Http; +using System.Text.Json; +using System.Text.Json.Serialization; +using System.Text.RegularExpressions; + +namespace LangChain.Utilities; + +/// +/// DuckDuckGo search client +/// +public sealed class DuckDuckGoSearch : IDisposable +{ + private readonly HttpClient _client = new( + new HttpClientHandler + { + AllowAutoRedirect = true, + MaxAutomaticRedirections = 2 + }); + + private readonly Regex _stringTagsRegex = new Regex("<.*?>", RegexOptions.Compiled); + private readonly Regex _regex500InUrl = new Regex("(?:\\d{3}-\\d{2}\\.js)", RegexOptions.Compiled); + + /// + /// DuckDuckGo text search generator. 
Query params: https://duckduckgo.com/params + /// + /// + /// + /// keywords for query + /// + /// + /// + /// + /// api, html, lite. Defaults to api. + /// api - collect data from https://duckduckgo.com, + /// html - collect data from https://html.duckduckgo.com, + /// lite - collect data from https://lite.duckduckgo.com. + /// + /// max number of results. If null, returns results only from the first response + /// + public async IAsyncEnumerable> TextSearchAsync( + string keywords, + string region = "wt-wt", + SafeSearchType safeSearch = SafeSearchType.Moderate, + TimeLimit? timeLimit = null, + int? maxResults = null) + { + var results = TextSearchApiAsync(keywords, region, safeSearch, timeLimit, maxResults); + var resultsCounter = 0; + await foreach (var result in results) + { + yield return result; + resultsCounter += 1; + if (maxResults != null && resultsCounter >= maxResults) + { + yield break; + } + } + } + + private async IAsyncEnumerable> TextSearchApiAsync( + string keywords, + string region, + SafeSearchType safeSearch, + TimeLimit? timeLimit, + int? maxResults) + { + var payload = await GetPayloadAsync(keywords, region, safeSearch, timeLimit); + + var i = 0; + var cache = new HashSet(); + while (i++ <= 10) + { + var response = await HttpGetAsync("https://links.duckduckgo.com/d.js", payload); + if (response.StatusCode != HttpStatusCode.OK) + { + yield break; + } + + LinksResponse.LinksResponseItem[]? pageData; + try + { + var contentRaw = await response.Content.ReadAsStringAsync(); + var content = JsonSerializer.Deserialize(contentRaw); + + pageData = content?.Results; + if (pageData == null || pageData.Length == 0) + { + yield break; + } + } + catch + { + yield break; + } + + string? 
nextPageUrl = null; + var resultExists = false; + foreach (var row in pageData) + { + var href = row.Url; + if (href != null && + !cache.Contains(href) && + href != $"http://www.google.com/search?q={keywords}") + { + cache.Add(href); + var body = NormalizeHtml(row.Body); + if (!String.IsNullOrEmpty(body)) + { + resultExists = true; + yield return new Dictionary + { + ["title"] = NormalizeHtml(row.Title), + ["href"] = NormalizeUrl(href), + ["body"] = body, + }; + } + } + else + { + nextPageUrl = row.NextPageUrl; + } + } + + if (maxResults == null || resultExists == false || String.IsNullOrEmpty(nextPageUrl)) + { + yield break; + } + + var separator = new[] { "s=" }; + payload["s"] = nextPageUrl.Split(separator, StringSplitOptions.RemoveEmptyEntries)[1].Split('&')[0]; + + await Sleep(); + } + } + + private async Task> GetPayloadAsync( + string keywords, + string region, + SafeSearchType safeSearch, TimeLimit? timeLimit) + { + var vqd = await GetVqdAsync(keywords); + + var timeLimitString = timeLimit switch + { + TimeLimit.Day => "d", + TimeLimit.Week => "w", + TimeLimit.Month => "m", + TimeLimit.Year => "y", + _ => String.Empty + }; + + var payload = new Dictionary + { + ["q"] = keywords, + ["kl"] = region, + ["l"] = region, + ["bing_market"] = region, + ["s"] = "0", + ["df"] = timeLimitString, + ["vqd"] = vqd, + ["o"] = "json", + ["sp"] = "0", + }; + + switch (safeSearch) + { + case SafeSearchType.Moderate: + payload["ex"] = "-1"; + break; + case SafeSearchType.Off: + payload["ex"] = "-2"; + break; + case SafeSearchType.On: + payload["p"] = "1"; + break; + default: + throw new ArgumentOutOfRangeException(nameof(safeSearch), safeSearch, null); + } + + return payload; + } + + /// + /// Unquote URL and replace spaces with '+' + /// + private static string NormalizeUrl(string url) + { + if (String.IsNullOrEmpty(url)) + { + return String.Empty; + } + + return WebUtility.UrlDecode(url.Replace(" ", "+")); + } + + /// + /// Strip HTML tags from the raw_html string. 
+ /// + private string NormalizeHtml(string rawHtml) + { + if (String.IsNullOrEmpty(rawHtml)) + { + return String.Empty; + } + + var html = _stringTagsRegex.Replace(rawHtml, ""); + + return WebUtility.HtmlDecode(html); + } + + private class LinksResponse + { + [JsonInclude] + [JsonPropertyName("results")] + public LinksResponseItem[]? Results { get; private set; } + + public class LinksResponseItem + { + [JsonInclude] + [JsonPropertyName("u")] + public string? Url { get; private set; } + + [JsonInclude] + [JsonPropertyName("t")] + public string? Title { get; private set; } + + [JsonInclude] + [JsonPropertyName("a")] + public string? Body { get; private set; } + + [JsonInclude] + [JsonPropertyName("n")] + public string? NextPageUrl { get; private set; } + } + } + + /// + /// Sleep between API requests if proxies is None. + /// + private async Task Sleep() + { + // TODO: if (proxies == null) + await Task.Delay(750); + } + + /// + /// Get vqd value for a search query. + /// + /// + /// + private async Task GetVqdAsync(string keywords) + { + var resp = await HttpGetAsync( + "https://duckduckgo.com", + new Dictionary + { + ["q"] = keywords + }); + + if (resp.StatusCode == HttpStatusCode.OK) + { + var content = await resp.Content.ReadAsStringAsync(); + + var vqdIndex = content.IndexOf("vqd=", StringComparison.Ordinal); + if (vqdIndex > 0) + { + var start = vqdIndex + "vqd=".Length; + var nextChar = content[start]; + + char endToken; + if (nextChar == '\'') + { + start += 1; + endToken = '\''; + } + else if (nextChar == '\"') + { + start += 1; + endToken = '\"'; + } + else + { + endToken = '&'; + } + + var end = content.IndexOf(endToken, start); + + return content.Substring(start, end - start); + } + } + + throw new VqdExtractionException($"Could not extract vqd. 
{keywords}"); + } + + private static string AddQueryParamsToUrl(string baseUrl, Dictionary queryParameters) + { + var queryParts = new List(); + foreach (var queryParameter in queryParameters) + { + var encodedKey = WebUtility.UrlEncode(queryParameter.Key); + var encodedValue = WebUtility.UrlEncode(queryParameter.Value); + + queryParts.Add($"{encodedKey}={encodedValue}"); + } + + var url = $"{baseUrl}?{String.Join("&", queryParts)}"; + + return url; + } + + private async Task HttpGetAsync(string url, Dictionary queryParams) + { + var urlWithQuery = AddQueryParamsToUrl(url, queryParams); + + HttpResponseMessage responseMessage; + try + { + responseMessage = await _client.GetAsync(urlWithQuery); + } + catch (TaskCanceledException e) + { + throw new TimeoutException($"HttpGetAsync {urlWithQuery}", e); + } + catch (Exception e) + { + throw new DuckDuckGoSearchException($"HttpGetAsync {urlWithQuery}. {e.GetType()}: {e}", e); + } + + var lastUrl = responseMessage.RequestMessage?.RequestUri?.ToString(); + if (lastUrl != null && Is500InUrl(lastUrl)) + { + throw new ApiException($"HttpGetAsync {urlWithQuery}"); + } + + if (responseMessage.StatusCode == HttpStatusCode.Accepted) + { + throw new RateLimitException($"HttpGetAsync {urlWithQuery}"); + } + + if (responseMessage.StatusCode == HttpStatusCode.OK) + { + return responseMessage; + } + + throw new HttpRequestException($"HttpGetAsync finished with status code: {responseMessage.StatusCode}"); + } + + /// + /// something like '506-00.js' inside the url + /// + private bool Is500InUrl(string url) + { + return _regex500InUrl.IsMatch(url); + } + + /// + public class VqdExtractionException(string? message) + : Exception(message); + + /// + public class DuckDuckGoSearchException(string? message, Exception innerException) + : Exception(message, innerException); + + /// + public class ApiException(string? message) + : Exception(message); + + /// + public class TimeoutException(string? 
message, Exception innerException) + : Exception(message, innerException); + + /// + public class RateLimitException(string? message) + : Exception(message); + + public enum TimeLimit + { + Day, + Week, + Month, + Year, + } + + public enum SafeSearchType + { + On, + Moderate, + Off + } + + public void Dispose() + { + _client.Dispose(); + } +} \ No newline at end of file diff --git a/src/libs/LangChain.Core/Utilities/DuckDuckGoSearchAPIWrapper.cs b/src/libs/LangChain.Core/Utilities/DuckDuckGoSearchAPIWrapper.cs new file mode 100644 index 00000000..d1ccb8d3 --- /dev/null +++ b/src/libs/LangChain.Core/Utilities/DuckDuckGoSearchAPIWrapper.cs @@ -0,0 +1,101 @@ +namespace LangChain.Utilities; + +/// +/// Wrapper for DuckDuckGo Search API. +/// +/// Free and does not require any setup. +/// +public sealed class DuckDuckGoSearchApiWrapper( + string region = "wt-wt", + DuckDuckGoSearch.SafeSearchType safeSearch = DuckDuckGoSearch.SafeSearchType.Moderate, + DuckDuckGoSearch.TimeLimit time = DuckDuckGoSearch.TimeLimit.Year, + int maxResults = 5) + : IWebSearch, IDisposable +{ + private readonly DuckDuckGoSearch _search = new(); + + /// + /// Get aggregated search result + /// + public async Task RunAsync(string query) + { + var snippets = await GetSnippetsAsync(query); + + return String.Join(" ", snippets); + } + + /// + /// Run query through DuckDuckGo and return concatenated results. + /// + public async Task> GetSnippetsAsync(string query) + { + var results = _search.TextSearchAsync( + query, + region: region, + safeSearch: safeSearch, + timeLimit: time); + + var snippets = new List(); + await foreach (var result in results) + { + snippets.Add(result["body"]); + + if (snippets.Count == maxResults) + { + break; + } + } + + if (snippets.Count == 0) + { + snippets.Add("No good DuckDuckGo Search Result was found"); + } + + return snippets; + } + + /// + /// Run query through DuckDuckGo and return metadata. 
+ /// + /// + /// Only "api" backend supported + /// + /// The query to search for. + /// The number of results to return. + /// + /// A list of items with the following props: + /// title - The title of the result. + /// body - The description (snippet) of the result. + /// link - The link to the result. + /// + public async Task> ResultsAsync( + string query, + int numResults) + { + var results = _search.TextSearchAsync( + query, + region: region, + safeSearch: safeSearch, + timeLimit: time, + maxResults: maxResults); + + var formattedResults = new List(); + await foreach (var result in results) + { + var formattedResult = new WebSearchResult(result["title"], result["body"], result["href"]); + formattedResults.Add(formattedResult); + + if (formattedResults.Count == numResults) + { + break; + } + } + + return formattedResults; + } + + public void Dispose() + { + _search.Dispose(); + } +} \ No newline at end of file diff --git a/src/libs/LangChain.Core/Utilities/IWebSearch.cs b/src/libs/LangChain.Core/Utilities/IWebSearch.cs new file mode 100644 index 00000000..30743425 --- /dev/null +++ b/src/libs/LangChain.Core/Utilities/IWebSearch.cs @@ -0,0 +1,14 @@ +namespace LangChain.Utilities; + +public interface IWebSearch +{ + Task RunAsync(string query); + Task> ResultsAsync(string query, int numResults); +} + +public class WebSearchResult(string title, string body, string link) +{ + public string Title { get; set; } = title; + public string Body { get; set; } = body; + public string Link { get; set; } = link; +} \ No newline at end of file diff --git a/src/tests/LangChain.Core.UnitTests/Utilities/DuckDuckGoSearchTests.cs b/src/tests/LangChain.Core.UnitTests/Utilities/DuckDuckGoSearchTests.cs new file mode 100644 index 00000000..6baf408e --- /dev/null +++ b/src/tests/LangChain.Core.UnitTests/Utilities/DuckDuckGoSearchTests.cs @@ -0,0 +1,42 @@ +using LangChain.Retrievers; +using LangChain.Utilities; + +namespace LangChain.Core.UnitTests.Utilities; + +[TestFixture] +public class 
DuckDuckGoSearchTests +{ + [Test] + public async Task Run_Ok() + { + var search = new DuckDuckGoSearchApiWrapper(); + + var result = await search.RunAsync("wikipedia"); + + result.Should().NotBeEmpty(); + result.Should().Contain("encyclopedia"); + } + + [Test] + public async Task GetSnippets_Ok() + { + var search = new DuckDuckGoSearchApiWrapper(); + + var result = await search.GetSnippetsAsync("wikipedia"); + + result.Should().NotBeEmpty(); + result.Should().Contain(v => v.Contains("encyclopedia")); + } + + [Test] + public async Task Retriever_Ok() + { + var search = new DuckDuckGoSearchApiWrapper(); + var retriever = new WebSearchRetriever(search); + + var result = await retriever.GetRelevantDocumentsAsync("wikipedia"); + + result.Should().NotBeEmpty(); + result.Should().Contain(d => d.PageContent.Contains("encyclopedia")); + } +} \ No newline at end of file