Skip to content
This repository has been archived by the owner on Jan 2, 2025. It is now read-only.

Commit

Permalink
[BLO-1840] Determinism semantic search (first call) (#1121)
Browse files Browse the repository at this point in the history
* set gpt3 temperature to 0, and use a modified rake that always gives the same order and does not ignore underscores and slashes

* set gpt3 temperature to 0, and use a modified rake that always gives the same order and does not ignore underscores and slashes

* rebase and add modified github repo to Cargo.toml

* handling the case where keywords is empty

* move stopwords removal logic into bleep

* attribution

* address review comments

* revert

* Update server/bleep/src/query/stopwords.rs

Co-authored-by: akshay <[email protected]>

* address more comments

---------

Co-authored-by: Gabriel Gordon-Hall <[email protected]>
Co-authored-by: akshay <[email protected]>
  • Loading branch information
3 people authored Nov 10, 2023
1 parent b6a1eb3 commit eec3634
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 110 deletions.
106 changes: 25 additions & 81 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion server/bleep/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ git-version = "0.3.5"
gix = { git = "https://github.com/BloopAI/gitoxide", version="0.55.2", features = ["blocking-http-transport-reqwest-rust-tls-no-trust-dns", "pack-cache-lru-static"] }

# semantic
rake = "0.1"
qdrant-client = { version = "1.5.0", default-features = false }
tiktoken-rs = "0.4.5"
tokenizers = { version = "0.14.0", default-features = false, features = ["progressbar", "cli", "onig", "esaxx_fast"] }
Expand Down
32 changes: 4 additions & 28 deletions server/bleep/src/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,14 @@ use std::{sync::Arc, time::Duration};

use anyhow::{anyhow, Context, Result};
use futures::{Future, TryStreamExt};
use once_cell::sync::OnceCell;
use rake::*;
use tokio::sync::mpsc::Sender;
use tracing::{debug, error, info, instrument};

use crate::{
analytics::{EventData, QueryEvent},
indexes::reader::{ContentDocument, FileDocument},
llm_gateway::{self, api::FunctionCall},
query::parser,
query::{parser, stopwords::remove_stopwords},
repo::RepoRef,
semantic,
webserver::{
Expand Down Expand Up @@ -43,19 +41,6 @@ mod tools {
pub mod proc;
}

static STOPWORDS: OnceCell<StopWords> = OnceCell::new();
static STOP_WORDS_LIST: &str = include_str!("stopwords.txt");

fn stop_words() -> &'static StopWords {
STOPWORDS.get_or_init(|| {
let mut sw = StopWords::new();
for w in STOP_WORDS_LIST.lines() {
sw.insert(w.to_string());
}
sw
})
}

pub enum Error {
Timeout(Duration),
Processing(anyhow::Error),
Expand Down Expand Up @@ -196,23 +181,14 @@ impl Agent {

// Always make a code search for the user query on the first exchange
if self.exchanges.len() == 1 {
// Extract keywords from the query
let keywords = {
let sw = stop_words();
let r = Rake::new(sw.clone());
let keywords = r.run(s);

if keywords.is_empty() {
let keys = remove_stopwords(s);
if keys.is_empty() {
s.clone()
} else {
keywords
.iter()
.map(|k| k.keyword.clone())
.collect::<Vec<_>>()
.join(" ")
keys
}
};

self.code_search(&keywords).await?;
}
s.clone()
Expand Down
1 change: 1 addition & 0 deletions server/bleep/src/agent/tools/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ impl Agent {
.llm_gateway
.clone()
.model("gpt-3.5-turbo-0613")
.temperature(0.0)
.chat(&prompt, None)
.await?;

Expand Down
1 change: 1 addition & 0 deletions server/bleep/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pub mod languages;
pub mod parser;
pub mod planner;
pub mod ranking;
pub mod stopwords;
Loading

0 comments on commit eec3634

Please sign in to comment.