nlp stuff
elimelt committed Feb 12, 2025
1 parent 25df114 commit ef14146
Showing 4 changed files with 142 additions and 33 deletions.
7 changes: 7 additions & 0 deletions distributed-systems/managing-critical-state.md
@@ -1,3 +1,10 @@
---
title: Distributed Consensus Fundamentals
category: other
tags: Distributed Systems, Consensus Algorithms
description: This document covers the fundamentals of distributed consensus algorithms, including leader election, replicated state machines, reliable datastores, and coordination services.
---

# Managing Critical State
[reading](https://sre.google/sre-book/managing-critical-state/)

7 changes: 7 additions & 0 deletions distributed-systems/non-blocking-two-phase-commit.md
@@ -1,3 +1,10 @@
---
title: Non-Blocking Two Phase Commit
category: distributed-systems
tags: paxos, two-phase commit, distributed transactions, consistency models
description: Explains the concept of non-blocking two phase commit using Paxos
---

# Non-Blocking Two Phase Commit

Regular 2PC is blocking because we need to wait for **all** nodes to agree that an operation is commit-able. This has significant performance implications for read-only transactions (which could be fixed with snapshot reads) and causes lock contention. We can fix the blocking by relying on Paxos, as sketched below.
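
A minimal sketch of the idea in Python (hypothetical, not part of this note): each participant's vote and the coordinator's decision are recorded in a replicated log, so no single crash can hide the outcome and block the transaction. `PaxosLog` here is a toy in-memory stand-in for a Paxos group, not a real implementation.

```python
class PaxosLog:
    """Toy stand-in for a Paxos group: a value, once chosen, stays chosen (durable)."""

    def __init__(self):
        self.chosen = None

    def propose(self, value):
        # Real Paxos would require acceptance by a majority of replicas;
        # here we only model the "chosen once, durable" property.
        if self.chosen is None:
            self.chosen = value
        return self.chosen


def non_blocking_2pc(participant_logs, votes):
    # Phase 1: each participant replicates its PREPARED/ABORT vote before replying,
    # so a participant crash does not lose the vote.
    recorded = {name: log.propose(votes[name]) for name, log in participant_logs.items()}

    # Phase 2: the coordinator replicates its decision; if the coordinator crashes,
    # any replica can learn the outcome from the log instead of blocking.
    decision_log = PaxosLog()
    decision = "COMMIT" if all(v == "PREPARED" for v in recorded.values()) else "ABORT"
    return decision_log.propose(decision)


logs = {"db1": PaxosLog(), "db2": PaxosLog()}
print(non_blocking_2pc(logs, {"db1": "PREPARED", "db2": "PREPARED"}))  # COMMIT
```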
33 changes: 0 additions & 33 deletions natural-language-processing/ppmi.ipynb
@@ -219,39 +219,6 @@
"source": [
"show_pmim_heatmap(ppmim, list(ds.vocab))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"max ppmi: 12.789616288115058, words: ('min_period', 'max_delay')\n"
]
}
],
"source": [
"max_ppmi = 0\n",
"mw = None, None\n",
"for word1 in ds.vocab:\n",
" for word2 in ds.vocab:\n",
" if word1 != word2:\n",
" ppmi = ppmim.get_ppmi(word1, word2)\n",
" max_ppmi = max(max_ppmi, ppmi)\n",
" if ppmi == max_ppmi:\n",
" mw = word1, word2\n",
"print(f'max ppmi: {max_ppmi}, words: {mw}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
128 changes: 128 additions & 0 deletions natural-language-processing/tf-idf.py
@@ -0,0 +1,128 @@
import numpy as np
import os
from typing import Dict, List, Tuple


def read_markdown_files(root_dir: str, max_depth: int = 8) -> Dict[str, str]:
    """Recursively collect .md files under root_dir (up to max_depth levels) into a {path: contents} dict."""

corpus = {}

def process_dir(path: str, depth: int = 0):
if depth > max_depth:
return

try:
for entry in os.listdir(path):
full_path = os.path.join(path, entry)
if os.path.isdir(full_path):
process_dir(full_path, depth + 1)
elif entry.endswith(".md"):
try:
with open(full_path, "r") as f:
corpus[full_path] = f.read()
except Exception:
continue
except Exception:
return

process_dir(root_dir)
return corpus


def create_index(corpus: Dict[str, str]) -> Tuple[Dict[str, Dict[str, int]], List[str]]:
    """Build an inverted index mapping word -> {doc_path: count}, returned with the vocabulary list."""

inv_idx = {}
for doc_name, content in corpus.items():
words = content.split()
for word in words:
if word not in inv_idx:
inv_idx[word] = {}
inv_idx[word][doc_name] = inv_idx[word].get(doc_name, 0) + 1

return inv_idx, list(inv_idx.keys())


def calculate_tfidf(
corpus: Dict[str, str], inv_idx: Dict[str, Dict[str, int]], word_list: List[str]
) -> Dict[str, np.ndarray]:
    """Compute a TF-IDF vector over word_list for each document in the corpus."""

N = len(corpus)
tfidf = {}

for doc_name, content in corpus.items():
doc_words = content.split()
doc_len = len(doc_words)
tfidf[doc_name] = np.zeros(len(word_list))

for i, word in enumerate(word_list):
if word in inv_idx and doc_name in inv_idx[word]:
tf = inv_idx[word][doc_name] / doc_len
idf = np.log(N / len(inv_idx[word]))
tfidf[doc_name][i] = tf * idf

return tfidf


def search(
query: str,
tfidf: Dict[str, np.ndarray],
word_list: List[str],
inv_idx: Dict[str, Dict[str, int]],
num_results: int = 5,
) -> List[Tuple[str, float]]:
    """Rank documents by dot product between their TF-IDF vectors and an IDF-weighted query vector."""

N = len(tfidf)
query_vec = np.zeros(len(word_list))

for i, word in enumerate(word_list):
if word in query.split() and word in inv_idx:
query_vec[i] = np.log(N / len(inv_idx[word]))

similarities = [
(doc_name, float(np.dot(doc_vec, query_vec)))
for doc_name, doc_vec in tfidf.items()
]

return sorted(similarities, key=lambda x: x[1], reverse=True)[:num_results]


def main():
    """Interactively build the index over a root directory, then answer search queries until 'exit'."""

try:
root_dir = (
input("Enter root directory to search (default '.'): ").strip() or "."
)
print(f"Reading markdown files from {root_dir}...")

corpus = read_markdown_files(root_dir)
if not corpus:
print("No markdown files found!")
return

print(f"Found {len(corpus)} markdown files")
print("Building search index...")

inv_idx, word_list = create_index(corpus)
tfidf = calculate_tfidf(corpus, inv_idx, word_list)

print("Search system ready!")

while True:
query = input('\nEnter search query (or "exit" to quit): ').strip()
if query.lower() == "exit":
break

results = search(query, tfidf, word_list, inv_idx)

print("\nSearch results:")
for doc_name, similarity in results:
print(f"{doc_name}: {similarity:.4f}")

except KeyboardInterrupt:
print("\nExiting...")
except Exception as e:
print(f"\nError: {str(e)}")


if __name__ == "__main__":
main()
