Commit
sync changes:
- A  idx/file_to_keywords.json
- A  idx/keyword_to_files.json
- M  scripts/build.py
elimelt committed Jan 13, 2025
1 parent 57a22de commit 19d406c
Showing 11 changed files with 443 additions and 6 deletions.
1 change: 1 addition & 0 deletions idx/file_to_keywords.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions idx/keyword_to_files.json

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions scripts/build.py
@@ -47,6 +47,8 @@ class SiteGenerator:
"__pycache__",
"node_modules",
".github",
"nlp.venv",
"site",
"venv",
".venv",
}
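The diff shows only the ignore set itself. As a rough sketch, assuming SiteGenerator applies the set the same way the DataReader in keywords.py below does (the walk itself is not shown in this commit), the new "nlp.venv" and "site" entries keep the NLP virtualenv and the generated output tree from being re-ingested as source content:

from pathlib import Path

# Hypothetical standalone version of the filter; only the set contents
# come from the diff, the walk is an assumption.
IGNORED_PATHS = {
    ".git", "__pycache__", "node_modules", ".github",
    "nlp.venv", "site", "venv", ".venv",
}

def walk_sources(root: Path):
    # Skip any file with an ignored directory anywhere in its path, so
    # the "site" output and "nlp.venv" are never picked up as content.
    for item in root.rglob("*"):
        if not any(part in IGNORED_PATHS for part in item.parts):
            if item.is_file():
                yield item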
116 changes: 116 additions & 0 deletions scripts/keywords.py
@@ -0,0 +1,116 @@
import json
import string
import nltk

from typing import Iterable, Iterator
from pathlib import Path
from keybert import KeyBERT

nltk.download("stopwords")


class DataReader:
def __init__(self, root_dir: str = "."):
self.root_dir = Path(root_dir)
self.IGNORED_PATHS = {
".git",
"__pycache__",
"node_modules",
".github",
"venv",
".venv",
"nlp.venv",
"site",
"README.md",
}

self.SUPPORTED_EXTENSIONS = {".md", ".txt"}

    def read_files(self) -> Iterator[Path]:
        return self._walk_directory(self.root_dir)

    def _walk_directory(self, directory: Path) -> Iterator[Path]:
"""Walk through directory while respecting ignored paths"""
for item in directory.rglob("*"):
if not any(ignored in item.parts for ignored in self.IGNORED_PATHS):
if item.is_file() and item.suffix in self.SUPPORTED_EXTENSIONS:
yield item


class Normalizer:
def __init__(self):
self.lower_case = True
self.remove_punctuation = False
self.remove_stopwords = True

self.stop_words = set(nltk.corpus.stopwords.words("english"))

def normalize_doc(self, doc: str) -> str:
if self.lower_case:
doc = doc.lower()

if self.remove_punctuation:
            doc = doc.translate(str.maketrans("", "", string.punctuation))

if self.remove_stopwords:
doc = " ".join(
[word for word in doc.split() if word not in self.stop_words]
)

return doc


def get_kw_path_map(files: Iterable[Path], model: KeyBERT) -> dict:
    """Map each file path to its KeyBERT-extracted (keyword, score) pairs."""
    normalizer = Normalizer()

    keyword_map = {}

    for file in files:
with open(file) as f:
doc = normalizer.normalize_doc(f.read())

keywords = model.extract_keywords(
doc,
keyphrase_ngram_range=(1, 1),
stop_words="english",
use_mmr=True,
diversity=0.3,
)

keyword_map["/".join(file.parts)] = keywords

return keyword_map


def aggregate(file_to_keywords: dict) -> dict:
    """Invert the index: map each keyword to the files it appears in."""
    keyword_to_files = {}
    for file, keywords in file_to_keywords.items():
        for keyword, _score in keywords:
            keyword_to_files.setdefault(keyword, []).append(file)
    return keyword_to_files


def write_idx_to_json_file(idx: dict, output_path: Path):
    """Serialize an index to JSON, creating parent directories as needed."""
    output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w") as f:
json.dump(idx, f)


if __name__ == "__main__":
reader = DataReader()
files = reader.read_files()
model = KeyBERT()

file_to_kw = get_kw_path_map(files, model)
kw_to_file = aggregate(file_to_kw)
write_idx_to_json_file(file_to_kw, Path("idx/file_to_keywords.json"))
write_idx_to_json_file(kw_to_file, Path("idx/keyword_to_files.json"))
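KeyBERT's extract_keywords returns a list of (keyword, score) tuples, which json.dump serializes as two-element arrays, so the two emitted indexes are mutual inverses. A quick consistency check (hypothetical, not part of this commit) could be:

import json
from pathlib import Path

# Load both generated indexes and verify that every keyword listed for
# a file points back to that file in the inverted index.
f2k = json.loads(Path("idx/file_to_keywords.json").read_text())
k2f = json.loads(Path("idx/keyword_to_files.json").read_text())

for file, keywords in f2k.items():
    for keyword, _score in keywords:  # tuples round-trip as [str, float]
        assert file in k2f[keyword], f"{keyword!r} missing link to {file}"

print(f"{len(f2k)} files, {len(k2f)} keywords: indexes consistent")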
1 change: 1 addition & 0 deletions site/idx/file_to_keywords.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions site/idx/keyword_to_files.json

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions site/index.html
@@ -189,7 +189,7 @@ <h1></h1>

<div class="landing-stats">
<div class="stat-item">
<span class="stat-value">151</span>
<span class="stat-value">152</span>
<span class="stat-label">Notes</span>
</div>
<div class="stat-item">
@@ -205,6 +205,11 @@ <h1></h1>
<div class="recent-section">
<h2>Recent</h2>
<ul class='recent-posts'>
+                <li>
+                    <a href="/test.html">Test</a>
+                    <span class="date">2025-01-12</span>
+
+                </li>
<li>
<a href="/systems-research/strong-inference.html">Strong Inference</a>
<span class="date">2025-01-12</span>
@@ -249,11 +254,6 @@ <h2>Recent</h2>
<a href="/designing-data-intensive-applications/part-1-foundations-of-data-systems/ch4-encoding-and-evolution.html">Encoding, Evolution, and Data Flow in Distributed Systems</a>
<span class="date">2025-01-01</span>
<span class="category">Distributed Systems</span>
-                </li>
-                <li>
-                    <a href="/designing-data-intensive-applications/part-1-foundations-of-data-systems/ch3-storage-and-retrieval.html">Storage and Retrieval Techniques for Database Systems</a>
-                    <span class="date">2025-01-01</span>
-                    <span class="category">Database Systems</span>
</li></ul>
</div>
<div class="categories-section">
2 changes: 2 additions & 0 deletions site/scripts/build.py
@@ -47,6 +47,8 @@ class SiteGenerator:
"__pycache__",
"node_modules",
".github",
"nlp.venv",
"site",
"venv",
".venv",
}
116 changes: 116 additions & 0 deletions site/scripts/keywords.py
@@ -0,0 +1,116 @@
import json
import string
import nltk

from typing import Iterable, Iterator
from pathlib import Path
from keybert import KeyBERT

nltk.download("stopwords")


class DataReader:
def __init__(self, root_dir: str = "."):
self.root_dir = Path(root_dir)
self.IGNORED_PATHS = {
".git",
"__pycache__",
"node_modules",
".github",
"venv",
".venv",
"nlp.venv",
"site",
"README.md",
}

self.SUPPORTED_EXTENSIONS = {".md", ".txt"}

    def read_files(self) -> Iterator[Path]:
        return self._walk_directory(self.root_dir)

    def _walk_directory(self, directory: Path) -> Iterator[Path]:
"""Walk through directory while respecting ignored paths"""
for item in directory.rglob("*"):
if not any(ignored in item.parts for ignored in self.IGNORED_PATHS):
if item.is_file() and item.suffix in self.SUPPORTED_EXTENSIONS:
yield item


class Normalizer:
def __init__(self):
self.lower_case = True
self.remove_punctuation = False
self.remove_stopwords = True

self.stop_words = set(nltk.corpus.stopwords.words("english"))

def normalize_doc(self, doc: str) -> str:
if self.lower_case:
doc = doc.lower()

if self.remove_punctuation:
            doc = doc.translate(str.maketrans("", "", string.punctuation))

if self.remove_stopwords:
doc = " ".join(
[word for word in doc.split() if word not in self.stop_words]
)

return doc


def get_kw_path_map(files: Iterable[Path], model: KeyBERT) -> dict:
    """Map each file path to its KeyBERT-extracted (keyword, score) pairs."""
    normalizer = Normalizer()

    keyword_map = {}

    for file in files:
with open(file) as f:
doc = normalizer.normalize_doc(f.read())

keywords = model.extract_keywords(
doc,
keyphrase_ngram_range=(1, 1),
stop_words="english",
use_mmr=True,
diversity=0.3,
)

keyword_map["/".join(file.parts)] = keywords

return keyword_map


def aggregate(file_to_keywords: dict) -> dict:
    """Invert the index: map each keyword to the files it appears in."""
    keyword_to_files = {}
    for file, keywords in file_to_keywords.items():
        for keyword, _score in keywords:
            keyword_to_files.setdefault(keyword, []).append(file)
    return keyword_to_files


def write_idx_to_json_file(idx: dict, output_path: Path):
    """Serialize an index to JSON, creating parent directories as needed."""
    output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w") as f:
json.dump(idx, f)


if __name__ == "__main__":
reader = DataReader()
files = reader.read_files()
model = KeyBERT()

file_to_kw = get_kw_path_map(files, model)
kw_to_file = aggregate(file_to_kw)
write_idx_to_json_file(file_to_kw, Path("idx/file_to_keywords.json"))
write_idx_to_json_file(kw_to_file, Path("idx/keyword_to_files.json"))