Skip to content

Commit

Permalink
Merge pull request #250 from khoj-ai/features/github-multi-repo-and-more
Browse files Browse the repository at this point in the history
Support multiple Github repositories and support indexing of multiple file types
  • Loading branch information
sabaimran committed Jun 27, 2023
2 parents 5da6a5e + 9d62d66 commit c0d35ba
Show file tree
Hide file tree
Showing 12 changed files with 257 additions and 84 deletions.
21 changes: 18 additions & 3 deletions src/khoj/interface/web/chat.html
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,21 @@
window.onload = function () {
fetch('/api/chat?client=web')
.then(response => response.json())
.then(data => data.response)
.then(chat_logs => {
.then(data => {
if (data.detail) {
// If the server returns a 500 error with detail, render it as a message.
renderMessage(data.detail + " You can configure Khoj chat in your <a class='inline-chat-link' href='/config'>settings</a>.", "khoj");
}
return data.response;
})
.then(response => {
// Render conversation history, if any
chat_logs.forEach(chat_log => {
response.forEach(chat_log => {
renderMessageWithReference(chat_log.message, chat_log.by, chat_log.context, new Date(chat_log.created));
});
})
.catch(err => {
return;
});

// Set welcome message on load
Expand Down Expand Up @@ -235,6 +244,12 @@
font-size: medium;
}

a.inline-chat-link {
color: #475569;
text-decoration: none;
border-bottom: 1px dotted #475569;
}

@media (pointer: coarse), (hover: none) {
abbr[title] {
position: relative;
Expand Down
128 changes: 95 additions & 33 deletions src/khoj/interface/web/content_type_github_input.html
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,25 @@ <h2 class="section-title">
<input type="text" id="pat-token" name="pat" value="{{ current_config['pat_token'] }}">
</td>
</tr>
<tr>
<td>
<label for="repo-owner">Repository Owner</label>
</td>
<td>
<input type="text" id="repo-owner" name="repo_owner" value="{{ current_config['repo_owner'] }}">
</td>
</tr>
<tr>
<td>
<label for="repo-name">Repository Name</label>
</td>
<td>
<input type="text" id="repo-name" name="repo_name" value="{{ current_config['repo_name'] }}">
</td>
</tr>
<tr>
<td>
<label for="repo-branch">Repository Branch</label>
</td>
<td>
<input type="text" id="repo-branch" name="repo_branch" value="{{ current_config['repo_branch'] }}">
</td>
</tr>
</table>
<h4>Repositories</h4>
<div id="repositories" class="section-cards">
{% for repo in current_config['repos'] %}
<div class="card repo" id="repo-card-{{loop.index}}">
<label for="repo-owner">Repository Owner</label>
<input type="text" id="repo-owner-{{loop.index}}" name="repo_owner" value="{{ repo.owner }}">
<label for="repo-name">Repository Name</label>
<input type="text" id="repo-name-{{loop.index}}" name="repo_name" value="{{ repo.name}}">
<label for="repo-branch">Repository Branch</label>
<input type="text" id="repo-branch-{{loop.index}}" name="repo_branch" value="{{ repo.branch }}">
<button type="button"
class="remove-repo-button"
onclick="remove_repo({{loop.index}})"
id="remove-repo-button-{{loop.index}}">Remove Repository</button>
</div>
{% endfor %}
</div>
<button type="button" id="add-repository-button">Add Repository</button>
<h4>You probably don't need to edit these.</h4>

<table>
Expand Down Expand Up @@ -68,16 +62,86 @@ <h4>You probably don't need to edit these.</h4>
</form>
</div>
</div>
<style>
div.repo {
width: 100%;
height: 100%;
grid-template-rows: none;
}
div#repositories {
margin-bottom: 12px;
}
button.remove-repo-button {
background-color: gainsboro;
}
</style>
<script>
// Append a blank repository card to the form each time "Add Repository" is clicked.
const add_repo_button = document.getElementById("add-repository-button");
add_repo_button.addEventListener("click", function(event) {
    event.preventDefault();

    // A millisecond timestamp keeps the new card's id apart from the small
    // loop.index ids used by the server-rendered cards.
    const id = Date.now();

    const repo = document.createElement("div");
    repo.classList.add("card", "repo");
    repo.id = "repo-card-" + id;

    // Suffix every input id with the card id: element ids must be unique in the
    // document, and a label is only associated with the input whose id equals
    // its `for` value. The submit handler reads the inputs by position inside
    // the card, so it does not depend on these ids.
    repo.innerHTML = `
        <label for="repo-owner-${id}">Repository Owner</label>
        <input type="text" id="repo-owner-${id}" name="repo_owner">
        <label for="repo-name-${id}">Repository Name</label>
        <input type="text" id="repo-name-${id}" name="repo_name">
        <label for="repo-branch-${id}">Repository Branch</label>
        <input type="text" id="repo-branch-${id}" name="repo_branch">
        <button type="button"
            class="remove-repo-button"
            onclick="remove_repo(${id})"
            id="remove-repo-button-${id}">Remove Repository</button>
    `;
    document.getElementById("repositories").appendChild(repo);
});

// Remove the repository card whose element id is "repo-card-<index>".
function remove_repo(index) {
    // getElementById returns null when no such card exists (for example if it
    // was already removed); optional chaining turns that case into a no-op
    // instead of a TypeError.
    document.getElementById("repo-card-" + index)?.remove();
}

submit.addEventListener("click", function(event) {
event.preventDefault();

var compressed_jsonl = document.getElementById("compressed-jsonl").value;
var embeddings_file = document.getElementById("embeddings-file").value;
var pat_token = document.getElementById("pat-token").value;
var repo_owner = document.getElementById("repo-owner").value;
var repo_name = document.getElementById("repo-name").value;
var repo_branch = document.getElementById("repo-branch").value;
const compressed_jsonl = document.getElementById("compressed-jsonl").value;
const embeddings_file = document.getElementById("embeddings-file").value;
const pat_token = document.getElementById("pat-token").value;

if (pat_token == "") {
document.getElementById("success").innerHTML = "❌ Please enter a Personal Access Token.";
document.getElementById("success").style.display = "block";
return;
}


var cards = document.getElementById("repositories").getElementsByClassName("repo");
var repos = [];

for (var i = 0; i < cards.length; i++) {
var card = cards[i];
var owner = card.getElementsByTagName("input")[0].value;
var name = card.getElementsByTagName("input")[1].value;
var branch = card.getElementsByTagName("input")[2].value;

if (owner == "" || name == "" || branch == "") {
continue;
}

repos.push({
"owner": owner,
"name": name,
"branch": branch,
});
}

if (repos.length == 0) {
document.getElementById("success").innerHTML = "❌ Please add at least one repository.";
document.getElementById("success").style.display = "block";
return;
}

const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
fetch('/api/config/data/content_type/github', {
Expand All @@ -88,9 +152,7 @@ <h4>You probably don't need to edit these.</h4>
},
body: JSON.stringify({
"pat_token": pat_token,
"repo_owner": repo_owner,
"repo_name": repo_name,
"repo_branch": repo_branch,
"repos": repos,
"compressed_jsonl": compressed_jsonl,
"embeddings_file": embeddings_file,
})
Expand Down
23 changes: 22 additions & 1 deletion src/khoj/interface/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,27 @@
}).join("\n") + `</div>`;
}

// Render search results that mix several file types (used for "github" results).
// Entries are grouped by the extension of item.additional.file and each group is
// handed to its renderer; the HTML is concatenated in the order org, markdown, pdf.
function render_mutliple(query, data, type) {
    const org_files = [];
    const markdown_files = [];
    const pdf_files = [];

    data.forEach((item) => {
        // Tolerate entries without file metadata instead of throwing on endsWith.
        const file = item.additional?.file ?? "";
        if (file.endsWith(".org")) {
            org_files.push(item);
        } else if (file.endsWith(".pdf")) {
            pdf_files.push(item);
        } else {
            // Markdown files, plus anything with another or no extension
            // (presumably e.g. commit-message entries; verify against the indexer).
            // Rendering these as markdown keeps them visible rather than
            // silently dropping them from the results.
            markdown_files.push(item);
        }
    });

    let html = "";
    if (org_files.length > 0) {
        html += render_org(query, org_files, type);
    }

    if (markdown_files.length > 0) {
        html += render_markdown(query, markdown_files);
    }

    if (pdf_files.length > 0) {
        html += render_pdf(query, pdf_files);
    }

    return html;
}

function render_json(data, query, type) {
if (type === "markdown") {
return render_markdown(query, data);
Expand All @@ -71,7 +92,7 @@
} else if (type === "pdf") {
return render_pdf(query, data);
} else if (type == "github") {
return render_markdown(query, data);
return render_mutliple(query, data, type);
} else {
return `<div id="results-plugin">`
+ data.map((item) => `<p>${item.entry}</p>`).join("\n")
Expand Down
75 changes: 56 additions & 19 deletions src/khoj/processor/github/github_to_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

# Internal Packages
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry, GithubContentConfig
from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data

Expand All @@ -21,7 +22,6 @@ class GithubToJsonl(TextToJsonl):
def __init__(self, config: GithubContentConfig):
    """Create a Github processor bound to the given content configuration."""
    # Hand the config to the base class first, then keep a reference on the
    # instance for the methods that read it (pat_token, repos).
    super().__init__(config)
    self.config = config
self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"

@staticmethod
def wait_for_rate_limit_reset(response, func, *args, **kwargs):
Expand All @@ -34,26 +34,43 @@ def wait_for_rate_limit_reset(response, func, *args, **kwargs):
return

def process(self, previous_entries=None):
    """Index every configured repository and merge the result with previous entries.

    Each repo in ``self.config.repos`` is processed in order via
    ``process_repo``; the concatenated entries are then passed, together with
    ``previous_entries``, to ``update_entries_with_ids`` whose result is returned.
    """
    # Flatten the per-repository entry lists into a single list, in repo order
    all_repo_entries = [
        entry for repo in self.config.repos for entry in self.process_repo(repo, previous_entries)
    ]

    return self.update_entries_with_ids(all_repo_entries, previous_entries)

def process_repo(self, repo: GithubRepoConfig, previous_entries=None):
    """Download one Github repository and convert its content to entries.

    Args:
        repo: Repository to index; its owner, name and branch are read.
        previous_entries: Accepted for signature compatibility with the
            caller; not used in this method.

    Returns:
        Entries built from the repository's markdown files, org files and
        commit messages, after splitting with ``max_tokens=256``.

    Raises:
        Exception: whatever ``get_files`` raised, re-raised after logging.
    """
    repo_url = f"https://api.github.com/repos/{repo.owner}/{repo.name}"
    repo_shorthand = f"{repo.owner}/{repo.name}"
    logger.info(f"Processing github repo {repo_shorthand}")
    with timer("Download markdown files from github repo", logger):
        try:
            markdown_files, org_files = self.get_files(repo_url, repo)
        except Exception:
            logger.error(f"Unable to download github repo {repo_shorthand}")
            # Bare raise re-raises the active exception with its traceback untouched
            raise

    logger.info(f"Found {len(markdown_files)} markdown files in github repo {repo_shorthand}")
    logger.info(f"Found {len(org_files)} org files in github repo {repo_shorthand}")

    with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
        current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
            *GithubToJsonl.extract_markdown_entries(markdown_files)
        )

    with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
        current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files))

    with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
        current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)

    with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
        current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)

    return current_entries

def update_entries_with_ids(self, current_entries, previous_entries):
# Identify, mark and merge any new entries with previous entries
with timer("Identify new or updated entries", logger):
if not previous_entries:
Expand All @@ -76,31 +93,40 @@ def process(self, previous_entries=None):

return entries_with_ids

def get_files(self, repo_url: str, repo: GithubRepoConfig):
    """Download the markdown and org files on a branch of a Github repository.

    Args:
        repo_url: Github API base URL of the repository.
        repo: Repository config; its owner, name and branch are read.

    Returns:
        Tuple ``(markdown_files, org_files)``. Each is a list of dicts with
        the keys ``"content"`` (result of ``get_file_contents`` for the blob)
        and ``"path"`` (github.com blob URL of the file).
    """
    # Get the recursive file tree of the repository branch
    repo_content_url = f"{repo_url}/git/trees/{repo.branch}"
    headers = {"Authorization": f"token {self.config.pat_token}"}
    params = {"recursive": "true"}
    response = requests.get(repo_content_url, headers=headers, params=params)

    # Wait for rate limit reset if needed. The retry callable must be given
    # the same arguments as this call: get_files cannot be invoked without
    # repo_url and repo. (Presumably the helper calls func(*args, **kwargs)
    # after waiting; its body is not shown here.)
    result = self.wait_for_rate_limit_reset(response, self.get_files, repo_url, repo)
    if result is not None:
        return result

    # Parse the body only once we know the request does not need retrying
    contents = response.json()

    # Collect markdown and org files from the repository tree
    markdown_files = []
    org_files = []
    blob_url_prefix = f"https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}"
    for item in contents["tree"]:
        if item["type"] != "blob":
            continue

        if item["path"].endswith(".md"):
            matching_files = markdown_files
        elif item["path"].endswith(".org"):
            matching_files = org_files
        else:
            continue

        # Add file contents and the file's URL on Github to the matching list
        url_path = f'{blob_url_prefix}/{item["path"]}'
        matching_files.append({"content": self.get_file_contents(item["url"]), "path": url_path})

    return markdown_files, org_files

def get_file_contents(self, file_url):
# Get text from each markdown file
Expand All @@ -114,9 +140,9 @@ def get_file_contents(self, file_url):

return response.content.decode("utf-8")

def get_commits(self) -> List[Dict]:
def get_commits(self, repo_url: str) -> List[Dict]:
# Get commit messages from the repository using the Github API
commits_url = f"{self.repo_url}/commits"
commits_url = f"{repo_url}/commits"
headers = {"Authorization": f"token {self.config.pat_token}"}
params = {"per_page": 100}
commits = []
Expand All @@ -140,10 +166,10 @@ def get_commits(self) -> List[Dict]:

return commits

def convert_commits_to_entries(self, commits) -> List[Entry]:
def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]:
entries: List[Entry] = []
for commit in commits:
compiled = f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}'
compiled = f'Commit message from {repo.owner}/{repo.name}:\n{commit["content"]}'
entries.append(
Entry(
compiled=compiled,
Expand All @@ -164,3 +190,14 @@ def extract_markdown_entries(markdown_files):
doc["content"], doc["path"], entries, entry_to_file_map
)
return entries, dict(entry_to_file_map)

@staticmethod
def extract_org_entries(org_files):
entries = []
entry_to_file_map = []

for doc in org_files:
entries, entry_to_file_map = OrgToJsonl.process_single_org_file(
doc["content"], doc["path"], entries, entry_to_file_map
)
return entries, dict(entry_to_file_map)
Loading

0 comments on commit c0d35ba

Please sign in to comment.