Skip to content

Commit

Permalink
sync changes:
Browse files Browse the repository at this point in the history
- M  idx/file_to_keywords.json

- M  idx/keyword_to_files.json

- M  scripts/keywords.py
  • Loading branch information
elimelt committed Jan 13, 2025
1 parent 19d406c commit 4df94fb
Show file tree
Hide file tree
Showing 113 changed files with 149 additions and 329 deletions.
2 changes: 1 addition & 1 deletion idx/file_to_keywords.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion idx/keyword_to_files.json

Large diffs are not rendered by default.

20 changes: 18 additions & 2 deletions scripts/keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
import string
import nltk

from nltk.stem import WordNetLemmatizer
from typing import List
from pathlib import Path
from keybert import KeyBERT

nltk.download("stopwords")

nltk.download('wordnet')

class DataReader:
def __init__(self, root_dir: str = "."):
Expand Down Expand Up @@ -83,6 +84,21 @@ def get_kw_path_map(files: List[Path], model: KeyBERT) -> dict:

return keyword_map

def deduplicate(file_to_keywords: dict) -> dict:
    """Lemmatize every keyword and drop repeated lemmas within each file.

    Args:
        file_to_keywords: Maps a file key to an iterable of
            ``(keyword, value)`` pairs.

    Returns:
        A new dict with the same keys. Each value is a list of
        ``(lemma, value)`` tuples in first-seen order; when several keywords
        of one file lemmatize to the same string, only the first pair is kept.

    Side effects:
        Prints each keyword whose lemma differs from the original, followed
        by the total number of such keywords.
    """
    wnl = WordNetLemmatizer()
    deduped = {}
    # Counts keywords changed by lemmatization (not the number of dropped pairs).
    dedup_count = 0
    for f, kws in file_to_keywords.items():
        kept = []
        # Lemmas are tracked in their own set: `kept` stores (lemma, value)
        # tuples, so testing a bare string for membership in it never matches
        # and would let every duplicate through.
        seen = set()
        for kw, acc in kws:
            stem = wnl.lemmatize(kw)
            if stem != kw:
                print('stem', stem, 'kw', kw)
                dedup_count += 1
            if stem not in seen:
                seen.add(stem)
                kept.append((stem, acc))
        deduped[f] = kept
    print('dedup count', dedup_count)
    return deduped

def aggregate(file_to_keywords: dict) -> dict:
keyword_to_files = {}
Expand Down Expand Up @@ -110,7 +126,7 @@ def write_idx_to_json_file(idx: dict, output_path: Path):
files = reader.read_files()
model = KeyBERT()

file_to_kw = get_kw_path_map(files, model)
file_to_kw = deduplicate(get_kw_path_map(files, model))
kw_to_file = aggregate(file_to_kw)
write_idx_to_json_file(file_to_kw, Path("idx/file_to_keywords.json"))
write_idx_to_json_file(kw_to_file, Path("idx/keyword_to_files.json"))
15 changes: 0 additions & 15 deletions scripts/simple.md

This file was deleted.

2 changes: 1 addition & 1 deletion site/categories/algorithm analysis.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: Algorithm Analysis</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/algorithms.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: algorithms</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/computer science.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: Computer Science</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/database design.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: Database Design</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/database systems.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: Database Systems</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/distributed systems.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: Distributed Systems</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/graph theory.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: Graph Theory</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@
</div>
<h1>Categories</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/mathematics.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: Mathematics</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/operations research.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: Operations Research</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/research.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: research</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/software engineering.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: Software Engineering</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/categories/system-design.html
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
</div>
<h1>Category: system-design</h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">
Expand Down
2 changes: 1 addition & 1 deletion site/idx/file_to_keywords.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion site/idx/keyword_to_files.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions site/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -182,14 +182,14 @@
</div>
<h1></h1>
<div class="meta">
<span>Last modified: 2025-01-12</span>
<span>Last modified: 2025-01-13</span>

</div>
<div class="content">

<div class="landing-stats">
<div class="stat-item">
<span class="stat-value">152</span>
<span class="stat-value">151</span>
<span class="stat-label">Notes</span>
</div>
<div class="stat-item">
Expand Down
20 changes: 18 additions & 2 deletions site/scripts/keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
import string
import nltk

from nltk.stem import WordNetLemmatizer
from typing import List
from pathlib import Path
from keybert import KeyBERT

nltk.download("stopwords")

nltk.download('wordnet')

class DataReader:
def __init__(self, root_dir: str = "."):
Expand Down Expand Up @@ -83,6 +84,21 @@ def get_kw_path_map(files: List[Path], model: KeyBERT) -> dict:

return keyword_map

def deduplicate(file_to_keywords: dict) -> dict:
    """Lemmatize every keyword and drop repeated lemmas within each file.

    Args:
        file_to_keywords: Maps a file key to an iterable of
            ``(keyword, value)`` pairs.

    Returns:
        A new dict with the same keys. Each value is a list of
        ``(lemma, value)`` tuples in first-seen order; when several keywords
        of one file lemmatize to the same string, only the first pair is kept.

    Side effects:
        Prints each keyword whose lemma differs from the original, followed
        by the total number of such keywords.
    """
    wnl = WordNetLemmatizer()
    deduped = {}
    # Counts keywords changed by lemmatization (not the number of dropped pairs).
    dedup_count = 0
    for f, kws in file_to_keywords.items():
        kept = []
        # Lemmas are tracked in their own set: `kept` stores (lemma, value)
        # tuples, so testing a bare string for membership in it never matches
        # and would let every duplicate through.
        seen = set()
        for kw, acc in kws:
            stem = wnl.lemmatize(kw)
            if stem != kw:
                print('stem', stem, 'kw', kw)
                dedup_count += 1
            if stem not in seen:
                seen.add(stem)
                kept.append((stem, acc))
        deduped[f] = kept
    print('dedup count', dedup_count)
    return deduped

def aggregate(file_to_keywords: dict) -> dict:
keyword_to_files = {}
Expand Down Expand Up @@ -110,7 +126,7 @@ def write_idx_to_json_file(idx: dict, output_path: Path):
files = reader.read_files()
model = KeyBERT()

file_to_kw = get_kw_path_map(files, model)
file_to_kw = deduplicate(get_kw_path_map(files, model))
kw_to_file = aggregate(file_to_kw)
write_idx_to_json_file(file_to_kw, Path("idx/file_to_keywords.json"))
write_idx_to_json_file(kw_to_file, Path("idx/keyword_to_files.json"))
Loading

0 comments on commit 4df94fb

Please sign in to comment.