Commit
sync changes:
- A  idx/file_to_keywords.json
- A  idx/keyword_to_files.json
- M  scripts/build.py
elimelt committed Jan 13, 2025
1 parent 57a22de commit 19d406c
Showing 11 changed files with 443 additions and 6 deletions.
1 change: 1 addition & 0 deletions idx/file_to_keywords.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions idx/keyword_to_files.json

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions scripts/build.py
@@ -47,6 +47,8 @@ class SiteGenerator:
"__pycache__",
"node_modules",
".github",
"nlp.venv",
"site",
"venv",
".venv",
}
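The diff shows only the ignore set itself. As a rough sketch, assuming SiteGenerator applies the set the same way the DataReader in keywords.py below does (the walk itself is not shown in this commit), the new "nlp.venv" and "site" entries keep the NLP virtualenv and the generated output tree from being re-ingested as source content:

from pathlib import Path

# Hypothetical standalone version of the filter; only the set contents
# come from the diff, the walk is an assumption.
IGNORED_PATHS = {
    ".git", "__pycache__", "node_modules", ".github",
    "nlp.venv", "site", "venv", ".venv",
}

def walk_sources(root: Path):
    # Skip any file with an ignored directory anywhere in its path, so
    # the "site" output and "nlp.venv" are never picked up as content.
    for item in root.rglob("*"):
        if not any(part in IGNORED_PATHS for part in item.parts):
            if item.is_file():
                yield item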
116 changes: 116 additions & 0 deletions scripts/keywords.py
@@ -0,0 +1,116 @@
import json
import string
import nltk

from typing import Iterable, Iterator
from pathlib import Path
from keybert import KeyBERT

nltk.download("stopwords")


class DataReader:
def __init__(self, root_dir: str = "."):
self.root_dir = Path(root_dir)
self.IGNORED_PATHS = {
".git",
"__pycache__",
"node_modules",
".github",
"venv",
".venv",
"nlp.venv",
"site",
"README.md",
}

self.SUPPORTED_EXTENSIONS = {".md", ".txt"}

    def read_files(self) -> Iterator[Path]:
        return self._walk_directory(self.root_dir)

    def _walk_directory(self, directory: Path) -> Iterator[Path]:
"""Walk through directory while respecting ignored paths"""
for item in directory.rglob("*"):
if not any(ignored in item.parts for ignored in self.IGNORED_PATHS):
if item.is_file() and item.suffix in self.SUPPORTED_EXTENSIONS:
yield item


class Normalizer:
def __init__(self):
self.lower_case = True
self.remove_punctuation = False
self.remove_stopwords = True

self.stop_words = set(nltk.corpus.stopwords.words("english"))

def normalize_doc(self, doc: str) -> str:
if self.lower_case:
doc = doc.lower()

if self.remove_punctuation:
            doc = doc.translate(str.maketrans("", "", string.punctuation))

if self.remove_stopwords:
doc = " ".join(
[word for word in doc.split() if word not in self.stop_words]
)

return doc


def get_kw_path_map(files: Iterable[Path], model: KeyBERT) -> dict:
    """Map each file path to its KeyBERT-extracted (keyword, score) pairs."""
    normalizer = Normalizer()

    keyword_map = {}

    for file in files:
with open(file) as f:
doc = normalizer.normalize_doc(f.read())

keywords = model.extract_keywords(
doc,
keyphrase_ngram_range=(1, 1),
stop_words="english",
use_mmr=True,
diversity=0.3,
)

keyword_map["/".join(file.parts)] = keywords

return keyword_map


def aggregate(file_to_keywords: dict) -> dict:
    """Invert the index: map each keyword to the files it appears in."""
    keyword_to_files = {}
    for file, keywords in file_to_keywords.items():
        for keyword, _score in keywords:
            keyword_to_files.setdefault(keyword, []).append(file)
    return keyword_to_files


def write_idx_to_json_file(idx: dict, output_path: Path):
    """Serialize an index to JSON, creating parent directories as needed."""
    output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w") as f:
json.dump(idx, f)


if __name__ == "__main__":
reader = DataReader()
files = reader.read_files()
model = KeyBERT()

file_to_kw = get_kw_path_map(files, model)
kw_to_file = aggregate(file_to_kw)
write_idx_to_json_file(file_to_kw, Path("idx/file_to_keywords.json"))
write_idx_to_json_file(kw_to_file, Path("idx/keyword_to_files.json"))
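KeyBERT's extract_keywords returns a list of (keyword, score) tuples, which json.dump serializes as two-element arrays, so the two emitted indexes are mutual inverses. A quick consistency check (hypothetical, not part of this commit) could be:

import json
from pathlib import Path

# Load both generated indexes and verify that every keyword listed for
# a file points back to that file in the inverted index.
f2k = json.loads(Path("idx/file_to_keywords.json").read_text())
k2f = json.loads(Path("idx/keyword_to_files.json").read_text())

for file, keywords in f2k.items():
    for keyword, _score in keywords:  # tuples round-trip as [str, float]
        assert file in k2f[keyword], f"{keyword!r} missing link to {file}"

print(f"{len(f2k)} files, {len(k2f)} keywords: indexes consistent")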
1 change: 1 addition & 0 deletions site/idx/file_to_keywords.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions site/idx/keyword_to_files.json

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions site/index.html
@@ -189,7 +189,7 @@ <h1></h1>

<div class="landing-stats">
<div class="stat-item">
<span class="stat-value">151</span>
<span class="stat-value">152</span>
<span class="stat-label">Notes</span>
</div>
<div class="stat-item">
@@ -205,6 +205,11 @@ <h1></h1>
<div class="recent-section">
<h2>Recent</h2>
<ul class='recent-posts'>
+                <li>
+                    <a href="/test.html">Test</a>
+                    <span class="date">2025-01-12</span>
+
+                </li>
<li>
<a href="/systems-research/strong-inference.html">Strong Inference</a>
<span class="date">2025-01-12</span>
@@ -249,11 +254,6 @@ <h2>Recent</h2>
<a href="/designing-data-intensive-applications/part-1-foundations-of-data-systems/ch4-encoding-and-evolution.html">Encoding, Evolution, and Data Flow in Distributed Systems</a>
<span class="date">2025-01-01</span>
<span class="category">Distributed Systems</span>
-                </li>
-                <li>
-                    <a href="/designing-data-intensive-applications/part-1-foundations-of-data-systems/ch3-storage-and-retrieval.html">Storage and Retrieval Techniques for Database Systems</a>
-                    <span class="date">2025-01-01</span>
-                    <span class="category">Database Systems</span>
</li></ul>
</div>
<div class="categories-section">
2 changes: 2 additions & 0 deletions site/scripts/build.py
@@ -47,6 +47,8 @@ class SiteGenerator:
"__pycache__",
"node_modules",
".github",
"nlp.venv",
"site",
"venv",
".venv",
}
116 changes: 116 additions & 0 deletions site/scripts/keywords.py
@@ -0,0 +1,116 @@
import json
import string
import nltk

from typing import Iterable, Iterator
from pathlib import Path
from keybert import KeyBERT

nltk.download("stopwords")


class DataReader:
def __init__(self, root_dir: str = "."):
self.root_dir = Path(root_dir)
self.IGNORED_PATHS = {
".git",
"__pycache__",
"node_modules",
".github",
"venv",
".venv",
"nlp.venv",
"site",
"README.md",
}

self.SUPPORTED_EXTENSIONS = {".md", ".txt"}

    def read_files(self) -> Iterator[Path]:
        return self._walk_directory(self.root_dir)

    def _walk_directory(self, directory: Path) -> Iterator[Path]:
"""Walk through directory while respecting ignored paths"""
for item in directory.rglob("*"):
if not any(ignored in item.parts for ignored in self.IGNORED_PATHS):
if item.is_file() and item.suffix in self.SUPPORTED_EXTENSIONS:
yield item


class Normalizer:
def __init__(self):
self.lower_case = True
self.remove_punctuation = False
self.remove_stopwords = True

self.stop_words = set(nltk.corpus.stopwords.words("english"))

def normalize_doc(self, doc: str) -> str:
if self.lower_case:
doc = doc.lower()

if self.remove_punctuation:
            doc = doc.translate(str.maketrans("", "", string.punctuation))

if self.remove_stopwords:
doc = " ".join(
[word for word in doc.split() if word not in self.stop_words]
)

return doc


def get_kw_path_map(files: Iterable[Path], model: KeyBERT) -> dict:
    """Map each file path to its KeyBERT-extracted (keyword, score) pairs."""
    normalizer = Normalizer()

    keyword_map = {}

    for file in files:
with open(file) as f:
doc = normalizer.normalize_doc(f.read())

keywords = model.extract_keywords(
doc,
keyphrase_ngram_range=(1, 1),
stop_words="english",
use_mmr=True,
diversity=0.3,
)

keyword_map["/".join(file.parts)] = keywords

return keyword_map


def aggregate(file_to_keywords: dict) -> dict:
    """Invert the index: map each keyword to the files it appears in."""
    keyword_to_files = {}
    for file, keywords in file_to_keywords.items():
        for keyword, _score in keywords:
            keyword_to_files.setdefault(keyword, []).append(file)
    return keyword_to_files


def write_idx_to_json_file(idx: dict, output_path: Path):
    """Serialize an index to JSON, creating parent directories as needed."""
    output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w") as f:
json.dump(idx, f)


if __name__ == "__main__":
reader = DataReader()
files = reader.read_files()
model = KeyBERT()

file_to_kw = get_kw_path_map(files, model)
kw_to_file = aggregate(file_to_kw)
write_idx_to_json_file(file_to_kw, Path("idx/file_to_keywords.json"))
write_idx_to_json_file(kw_to_file, Path("idx/keyword_to_files.json"))