nlp stuff
elimelt committed Feb 12, 2025
1 parent 25df114 commit ef14146
Showing 4 changed files with 142 additions and 33 deletions.
7 changes: 7 additions & 0 deletions distributed-systems/managing-critical-state.md
@@ -1,3 +1,10 @@
---
title: Distributed Consensus Fundamentals
category: other
tags: Distributed Systems, Consensus Algorithms
description: This document covers the fundamentals of distributed consensus algorithms, including leader election, replicated state machines, reliable datastores, and coordination services.
---

# Managing Critical State
[reading](https://sre.google/sre-book/managing-critical-state/)

7 changes: 7 additions & 0 deletions distributed-systems/non-blocking-two-phase-commit.md
@@ -1,3 +1,10 @@
---
title: Non-Blocking Two Phase Commit
category: distributed-systems
tags: paxos, two-phase commit, distributed transactions, consistency models
description: Explains the concept of non-blocking two phase commit using Paxos
---

# Non-Blocking Two Phase Commit

Regular 2PC is blocking because we need to wait for **all** nodes to agree that an operation is commit-able. This has significant performance implications for read-only transactions (which could be fixed with snapshot reads) and causes lock contention. We can fix the blocking by relying on Paxos, as sketched below.
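
A minimal sketch of the idea in Python (hypothetical, not part of this note): each participant's vote and the coordinator's decision are recorded in a replicated log, so no single crash can hide the outcome and block the transaction. `PaxosLog` here is a toy in-memory stand-in for a Paxos group, not a real implementation.

```python
class PaxosLog:
    """Toy stand-in for a Paxos group: a value, once chosen, stays chosen (durable)."""

    def __init__(self):
        self.chosen = None

    def propose(self, value):
        # Real Paxos would require acceptance by a majority of replicas;
        # here we only model the "chosen once, durable" property.
        if self.chosen is None:
            self.chosen = value
        return self.chosen


def non_blocking_2pc(participant_logs, votes):
    # Phase 1: each participant replicates its PREPARED/ABORT vote before replying,
    # so a participant crash does not lose the vote.
    recorded = {name: log.propose(votes[name]) for name, log in participant_logs.items()}

    # Phase 2: the coordinator replicates its decision; if the coordinator crashes,
    # any replica can learn the outcome from the log instead of blocking.
    decision_log = PaxosLog()
    decision = "COMMIT" if all(v == "PREPARED" for v in recorded.values()) else "ABORT"
    return decision_log.propose(decision)


logs = {"db1": PaxosLog(), "db2": PaxosLog()}
print(non_blocking_2pc(logs, {"db1": "PREPARED", "db2": "PREPARED"}))  # COMMIT
```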
33 changes: 0 additions & 33 deletions natural-language-processing/ppmi.ipynb
@@ -219,39 +219,6 @@
"source": [
"show_pmim_heatmap(ppmim, list(ds.vocab))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"max ppmi: 12.789616288115058, words: ('min_period', 'max_delay')\n"
]
}
],
"source": [
"max_ppmi = 0\n",
"mw = None, None\n",
"for word1 in ds.vocab:\n",
" for word2 in ds.vocab:\n",
" if word1 != word2:\n",
" ppmi = ppmim.get_ppmi(word1, word2)\n",
" max_ppmi = max(max_ppmi, ppmi)\n",
" if ppmi == max_ppmi:\n",
" mw = word1, word2\n",
"print(f'max ppmi: {max_ppmi}, words: {mw}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
128 changes: 128 additions & 0 deletions natural-language-processing/tf-idf.py
@@ -0,0 +1,128 @@
import numpy as np
import os
from typing import Dict, List, Tuple


def read_markdown_files(root_dir: str, max_depth: int = 8) -> Dict[str, str]:
    """Recursively collect .md files under root_dir (up to max_depth levels) into a {path: contents} dict."""

corpus = {}

def process_dir(path: str, depth: int = 0):
if depth > max_depth:
return

try:
for entry in os.listdir(path):
full_path = os.path.join(path, entry)
if os.path.isdir(full_path):
process_dir(full_path, depth + 1)
elif entry.endswith(".md"):
try:
with open(full_path, "r") as f:
corpus[full_path] = f.read()
except Exception:
continue
except Exception:
return

process_dir(root_dir)
return corpus


def create_index(corpus: Dict[str, str]) -> Tuple[Dict[str, Dict[str, int]], List[str]]:
    """Build an inverted index mapping word -> {doc_path: count}, returned with the vocabulary list."""

inv_idx = {}
for doc_name, content in corpus.items():
words = content.split()
for word in words:
if word not in inv_idx:
inv_idx[word] = {}
inv_idx[word][doc_name] = inv_idx[word].get(doc_name, 0) + 1

return inv_idx, list(inv_idx.keys())


def calculate_tfidf(
corpus: Dict[str, str], inv_idx: Dict[str, Dict[str, int]], word_list: List[str]
) -> Dict[str, np.ndarray]:
    """Compute a TF-IDF vector over word_list for each document in the corpus."""

N = len(corpus)
tfidf = {}

for doc_name, content in corpus.items():
doc_words = content.split()
doc_len = len(doc_words)
tfidf[doc_name] = np.zeros(len(word_list))

for i, word in enumerate(word_list):
if word in inv_idx and doc_name in inv_idx[word]:
tf = inv_idx[word][doc_name] / doc_len
idf = np.log(N / len(inv_idx[word]))
tfidf[doc_name][i] = tf * idf

return tfidf


def search(
query: str,
tfidf: Dict[str, np.ndarray],
word_list: List[str],
inv_idx: Dict[str, Dict[str, int]],
num_results: int = 5,
) -> List[Tuple[str, float]]:
    """Rank documents by dot product between their TF-IDF vectors and an IDF-weighted query vector."""

N = len(tfidf)
query_vec = np.zeros(len(word_list))

for i, word in enumerate(word_list):
if word in query.split() and word in inv_idx:
query_vec[i] = np.log(N / len(inv_idx[word]))

similarities = [
(doc_name, float(np.dot(doc_vec, query_vec)))
for doc_name, doc_vec in tfidf.items()
]

return sorted(similarities, key=lambda x: x[1], reverse=True)[:num_results]


def main():
    """Interactively build the index over a root directory, then answer search queries until 'exit'."""

try:
root_dir = (
input("Enter root directory to search (default '.'): ").strip() or "."
)
print(f"Reading markdown files from {root_dir}...")

corpus = read_markdown_files(root_dir)
if not corpus:
print("No markdown files found!")
return

print(f"Found {len(corpus)} markdown files")
print("Building search index...")

inv_idx, word_list = create_index(corpus)
tfidf = calculate_tfidf(corpus, inv_idx, word_list)

print("Search system ready!")

while True:
query = input('\nEnter search query (or "exit" to quit): ').strip()
if query.lower() == "exit":
break

results = search(query, tfidf, word_list, inv_idx)

print("\nSearch results:")
for doc_name, similarity in results:
print(f"{doc_name}: {similarity:.4f}")

except KeyboardInterrupt:
print("\nExiting...")
except Exception as e:
print(f"\nError: {str(e)}")


if __name__ == "__main__":
main()
