Commit

before the great tagging

elimelt committed Feb 11, 2025
1 parent c875a17 commit 645a1fc
Showing 27 changed files with 417 additions and 140 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,4 +2,5 @@
 *~
 .vscode
 .DS_Store
-venv
+venv
+*.pyc
6 changes: 1 addition & 5 deletions algorithms/DAGs.md
@@ -1,11 +1,7 @@
 ---
 title: Topological Ordering and Properties of Directed Acyclic Graphs
 category: algorithms
-tags:
-- graph theory
-- topological sorting
-- directed acyclic graphs
-- proofs
+tags: graph theory, topological sorting, directed acyclic graphs, proofs
 description: A technical exploration of Directed Acyclic Graphs (DAGs) focusing on their topological ordering properties and fundamental lemmas. The document includes mathematical proofs of key DAG properties and presents a Python implementation of the topological sorting algorithm.
 ---

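The description above mentions a Python implementation of topological sorting. As an illustrative sketch of that technique (Kahn's algorithm, not necessarily the file's own code, and assuming every vertex appears as a key in `adj`):

from collections import deque

def topological_sort(adj: dict[str, list[str]]) -> list[str]:
    """Kahn's algorithm: repeatedly remove vertices with in-degree 0."""
    indeg = {v: 0 for v in adj}  # assumes every vertex is a key of adj
    for v in adj:
        for u in adj[v]:
            indeg[u] += 1
    queue = deque(v for v in adj if indeg[v] == 0)
    order = []
    while queue:
        v = queue.popleft()
        order.append(v)
        for u in adj[v]:
            indeg[u] -= 1
            if indeg[u] == 0:
                queue.append(u)
    if len(order) != len(adj):
        raise ValueError("graph has a cycle; no topological order exists")
    return order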
6 changes: 1 addition & 5 deletions algorithms/divide-and-conquer.md
@@ -1,11 +1,7 @@
 ---
 title: Divide and Conquer Algorithm Analysis with Implementation Examples
 category: algorithms
-tags:
-- divide-and-conquer
-- algorithmic-complexity
-- recursive-algorithms
-- computational-geometry
+tags: divide-and-conquer, algorithmic-complexity, recursive-algorithms, computational-geometry
 description: A comprehensive examination of divide and conquer algorithmic strategies, focusing on their implementation and analysis. The document covers theoretical foundations with mathematical proofs, practical examples including bisection method and closest pair problem, and includes Python implementations demonstrating these concepts.
 ---

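Among the examples the description lists is the bisection method. A minimal sketch of the idea, assuming a continuous `f` that changes sign on `[lo, hi]` (illustrative, not the file's code):

def bisect_root(f, lo: float, hi: float, tol: float = 1e-9) -> float:
    """Halve the bracketing interval until it is narrower than tol."""
    assert f(lo) * f(hi) <= 0, "f must change sign on [lo, hi]"
    while hi - lo > tol:
        mid = (lo + hi) / 2
        if f(lo) * f(mid) <= 0:  # root lies in the left half
            hi = mid
        else:                    # root lies in the right half
            lo = mid
    return (lo + hi) / 2

# e.g. bisect_root(lambda x: x**2 - 2, 0.0, 2.0) ≈ 1.41421356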
2 changes: 1 addition & 1 deletion algorithms/greedy-algorithms.md
@@ -1,7 +1,7 @@
 ---
 title: Greedy Algorithms for Interval Scheduling and Partitioning
 category: Algorithm Analysis
-tags: algorithms, interval, scheduling, partitioning, greedy-algorithms
+tags: algorithms, interval, scheduling, partitioning, greedy
 description: This document explores greedy algorithms for interval scheduling and partitioning problems. It provides detailed explanations of the algorithms, including Python implementations, and presents rigorous proofs of correctness using techniques such as "Greedy Stays Ahead" and exchange arguments.
 ---

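The note's subject, the earliest-finish-time rule whose correctness "Greedy Stays Ahead" establishes, looks roughly like this (a sketch of the standard algorithm, not the file's own code):

def interval_schedule(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]:
    """Earliest-finish-time greedy: take each interval compatible with the last pick."""
    chosen = []
    last_finish = float("-inf")
    for start, finish in sorted(intervals, key=lambda iv: iv[1]):
        if start >= last_finish:  # compatible with everything chosen so far
            chosen.append((start, finish))
            last_finish = finish
    return chosen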
6 changes: 6 additions & 0 deletions distributed-systems/dynamo-db.md
@@ -1,3 +1,9 @@
+---
+title: "Dynamo: Amazon's Highly Available Key-value Store"
+category: database-systems
+tags: key-value store, database-design, high-availability, consistency, object-versioning, conflict-resolution
+description: A highly available key-value storage system sacrificing consistency under failure conditions, using object versioning and application assisted conflict resolution.
+---
 # Dynamo: Amazon's Highly Available Key-value Store
 
 [reading](https://dl-acm-org.offcampus.lib.washington.edu/doi/pdf/10.1145/1323293.1294281)
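The new description mentions object versioning with application-assisted conflict resolution; in the paper this is realized with vector clocks. A rough sketch of the dominance check (the names `descends` and `reconcile` are invented here; clocks are plain dicts of node → counter):

def descends(a: dict[str, int], b: dict[str, int]) -> bool:
    """True if vector clock `a` dominates `b`, i.e. `a` is a newer version."""
    return all(a.get(node, 0) >= n for node, n in b.items())

def reconcile(clock_a: dict[str, int], clock_b: dict[str, int]) -> str:
    if descends(clock_a, clock_b):
        return "keep a"    # a supersedes b
    if descends(clock_b, clock_a):
        return "keep b"    # b supersedes a
    return "conflict"      # concurrent writes: hand both versions to the application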
Empty file.
278 changes: 278 additions & 0 deletions natural-language-processing/ppmi.ipynb

Large diffs are not rendered by default.
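The notebook's diff is not rendered, so its contents are not shown here. As background, PPMI (positive pointwise mutual information) over a word–context co-occurrence matrix is conventionally computed as below (a generic sketch, not the notebook's code):

import numpy as np

def ppmi(counts: np.ndarray) -> np.ndarray:
    """PPMI(w, c) = max(0, log(P(w, c) / (P(w) * P(c))))."""
    total = counts.sum()
    p_wc = counts / total
    p_w = p_wc.sum(axis=1, keepdims=True)  # row (word) marginals
    p_c = p_wc.sum(axis=0, keepdims=True)  # column (context) marginals
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.where(p_wc > 0, np.log(p_wc / (p_w * p_c)), 0.0)
    return np.maximum(pmi, 0.0)  # clip negative PMI to zero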

128 changes: 0 additions & 128 deletions natural-language-processing/tf-idf.py

File renamed: contents are identical to natural-language-processing/tfidf.py below.
128 changes: 128 additions & 0 deletions natural-language-processing/tfidf.py
@@ -0,0 +1,128 @@
import numpy as np
import os
from typing import Dict, List, Tuple


def read_markdown_files(root_dir: str, max_depth: int = 8) -> Dict[str, str]:
    """Recursively collect .md files under root_dir into a {path: contents} corpus."""
    corpus = {}

    def process_dir(path: str, depth: int = 0):
        if depth > max_depth:
            return
        try:
            for entry in os.listdir(path):
                full_path = os.path.join(path, entry)
                if os.path.isdir(full_path):
                    process_dir(full_path, depth + 1)
                elif entry.endswith(".md"):
                    try:
                        with open(full_path, "r") as f:
                            corpus[full_path] = f.read()
                    except Exception:
                        continue
        except Exception:
            return

    process_dir(root_dir)
    return corpus


def create_index(corpus: Dict[str, str]) -> Tuple[Dict[str, Dict[str, int]], List[str]]:
    """Build an inverted index mapping word -> {document: term count}."""
    inv_idx = {}
    for doc_name, content in corpus.items():
        for word in content.split():
            if word not in inv_idx:
                inv_idx[word] = {}
            inv_idx[word][doc_name] = inv_idx[word].get(doc_name, 0) + 1

    return inv_idx, list(inv_idx.keys())


def calculate_tfidf(
    corpus: Dict[str, str], inv_idx: Dict[str, Dict[str, int]], word_list: List[str]
) -> Dict[str, np.ndarray]:
    """Compute one TF-IDF vector per document (tf = count / doc length, idf = log(N / df))."""
    N = len(corpus)
    tfidf = {}

    for doc_name, content in corpus.items():
        doc_len = len(content.split())
        tfidf[doc_name] = np.zeros(len(word_list))
        if doc_len == 0:
            continue  # skip empty documents to avoid division by zero

        for i, word in enumerate(word_list):
            if word in inv_idx and doc_name in inv_idx[word]:
                tf = inv_idx[word][doc_name] / doc_len
                idf = np.log(N / len(inv_idx[word]))
                tfidf[doc_name][i] = tf * idf

    return tfidf


def search(
    query: str,
    tfidf: Dict[str, np.ndarray],
    word_list: List[str],
    inv_idx: Dict[str, Dict[str, int]],
    num_results: int = 5,
) -> List[Tuple[str, float]]:
    """Rank documents by dot product with an IDF-weighted query vector."""
    N = len(tfidf)
    query_words = set(query.split())  # split once, not per vocabulary word
    query_vec = np.zeros(len(word_list))

    for i, word in enumerate(word_list):
        if word in query_words and word in inv_idx:
            query_vec[i] = np.log(N / len(inv_idx[word]))

    similarities = [
        (doc_name, float(np.dot(doc_vec, query_vec)))
        for doc_name, doc_vec in tfidf.items()
    ]

    return sorted(similarities, key=lambda x: x[1], reverse=True)[:num_results]


def main():
    try:
        root_dir = (
            input("Enter root directory to search (default '.'): ").strip() or "."
        )
        print(f"Reading markdown files from {root_dir}...")

        corpus = read_markdown_files(root_dir)
        if not corpus:
            print("No markdown files found!")
            return

        print(f"Found {len(corpus)} markdown files")
        print("Building search index...")

        inv_idx, word_list = create_index(corpus)
        tfidf = calculate_tfidf(corpus, inv_idx, word_list)

        print("Search system ready!")

        while True:
            query = input('\nEnter search query (or "exit" to quit): ').strip()
            if query.lower() == "exit":
                break

            results = search(query, tfidf, word_list, inv_idx)

            print("\nSearch results:")
            for doc_name, similarity in results:
                print(f"{doc_name}: {similarity:.4f}")

    except KeyboardInterrupt:
        print("\nExiting...")
    except Exception as e:
        print(f"\nError: {str(e)}")


if __name__ == "__main__":
    main()
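Besides the interactive `main()` loop, the functions compose directly. A small usage sketch, assuming the module is importable as `tfidf` and is run from the repository root:

from tfidf import read_markdown_files, create_index, calculate_tfidf, search

corpus = read_markdown_files(".")                      # {path: file contents}
inv_idx, word_list = create_index(corpus)              # word -> {doc: count}
vectors = calculate_tfidf(corpus, inv_idx, word_list)  # doc -> tf-idf vector

for doc, score in search("topological sort", vectors, word_list, inv_idx, num_results=3):
    print(f"{score:.4f}  {doc}")

Note that tokenization is a bare str.split(), so queries match exact, case-sensitive tokens only.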
Binary file added papers/gfs.pdf
Binary file added papers/google-cluster-architecture.pdf
Binary file added papers/lessons-from-giant-scale-services.pdf
Binary file added papers/mapreduce-SDPoLC.pdf
Binary file added systems-research/papers/amorph-os.pdf
Binary file added systems-research/papers/barrelfish.pdf
