Commit

before the great tagging

elimelt committed Feb 11, 2025
1 parent c875a17 commit 645a1fc
Showing 27 changed files with 417 additions and 140 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,4 +2,5 @@
 *~
 .vscode
 .DS_Store
-venv
+venv
+*.pyc
6 changes: 1 addition & 5 deletions algorithms/DAGs.md
@@ -1,11 +1,7 @@
 ---
 title: Topological Ordering and Properties of Directed Acyclic Graphs
 category: algorithms
-tags:
-- graph theory
-- topological sorting
-- directed acyclic graphs
-- proofs
+tags: graph theory, topological sorting, directed acyclic graphs, proofs
 description: A technical exploration of Directed Acyclic Graphs (DAGs) focusing on their topological ordering properties and fundamental lemmas. The document includes mathematical proofs of key DAG properties and presents a Python implementation of the topological sorting algorithm.
 ---

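The description above mentions a Python implementation of topological sorting. As an illustrative sketch of that technique (Kahn's algorithm, not necessarily the file's own code, and assuming every vertex appears as a key in `adj`):

from collections import deque

def topological_sort(adj: dict[str, list[str]]) -> list[str]:
    """Kahn's algorithm: repeatedly remove vertices with in-degree 0."""
    indeg = {v: 0 for v in adj}  # assumes every vertex is a key of adj
    for v in adj:
        for u in adj[v]:
            indeg[u] += 1
    queue = deque(v for v in adj if indeg[v] == 0)
    order = []
    while queue:
        v = queue.popleft()
        order.append(v)
        for u in adj[v]:
            indeg[u] -= 1
            if indeg[u] == 0:
                queue.append(u)
    if len(order) != len(adj):
        raise ValueError("graph has a cycle; no topological order exists")
    return order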
6 changes: 1 addition & 5 deletions algorithms/divide-and-conquer.md
@@ -1,11 +1,7 @@
 ---
 title: Divide and Conquer Algorithm Analysis with Implementation Examples
 category: algorithms
-tags:
-- divide-and-conquer
-- algorithmic-complexity
-- recursive-algorithms
-- computational-geometry
+tags: divide-and-conquer, algorithmic-complexity, recursive-algorithms, computational-geometry
 description: A comprehensive examination of divide and conquer algorithmic strategies, focusing on their implementation and analysis. The document covers theoretical foundations with mathematical proofs, practical examples including bisection method and closest pair problem, and includes Python implementations demonstrating these concepts.
 ---

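Among the examples the description lists is the bisection method. A minimal sketch of the idea, assuming a continuous `f` that changes sign on `[lo, hi]` (illustrative, not the file's code):

def bisect_root(f, lo: float, hi: float, tol: float = 1e-9) -> float:
    """Halve the bracketing interval until it is narrower than tol."""
    assert f(lo) * f(hi) <= 0, "f must change sign on [lo, hi]"
    while hi - lo > tol:
        mid = (lo + hi) / 2
        if f(lo) * f(mid) <= 0:  # root lies in the left half
            hi = mid
        else:                    # root lies in the right half
            lo = mid
    return (lo + hi) / 2

# e.g. bisect_root(lambda x: x**2 - 2, 0.0, 2.0) ≈ 1.41421356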
2 changes: 1 addition & 1 deletion algorithms/greedy-algorithms.md
@@ -1,7 +1,7 @@
 ---
 title: Greedy Algorithms for Interval Scheduling and Partitioning
 category: Algorithm Analysis
-tags: algorithms, interval, scheduling, partitioning, greedy-algorithms
+tags: algorithms, interval, scheduling, partitioning, greedy
 description: This document explores greedy algorithms for interval scheduling and partitioning problems. It provides detailed explanations of the algorithms, including Python implementations, and presents rigorous proofs of correctness using techniques such as "Greedy Stays Ahead" and exchange arguments.
 ---

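The note's subject, the earliest-finish-time rule whose correctness "Greedy Stays Ahead" establishes, looks roughly like this (a sketch of the standard algorithm, not the file's own code):

def interval_schedule(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]:
    """Earliest-finish-time greedy: take each interval compatible with the last pick."""
    chosen = []
    last_finish = float("-inf")
    for start, finish in sorted(intervals, key=lambda iv: iv[1]):
        if start >= last_finish:  # compatible with everything chosen so far
            chosen.append((start, finish))
            last_finish = finish
    return chosen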
6 changes: 6 additions & 0 deletions distributed-systems/dynamo-db.md
@@ -1,3 +1,9 @@
+---
+title: "Dynamo: Amazon's Highly Available Key-value Store"
+category: database-systems
+tags: key-value store, database-design, high-availability, consistency, object-versioning, conflict-resolution
+description: A highly available key-value storage system sacrificing consistency under failure conditions, using object versioning and application assisted conflict resolution.
+---
 # Dynamo: Amazon's Highly Available Key-value Store
 
 [reading](https://dl-acm-org.offcampus.lib.washington.edu/doi/pdf/10.1145/1323293.1294281)
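The new description mentions object versioning with application-assisted conflict resolution; in the paper this is realized with vector clocks. A rough sketch of the dominance check (the names `descends` and `reconcile` are invented here; clocks are plain dicts of node → counter):

def descends(a: dict[str, int], b: dict[str, int]) -> bool:
    """True if vector clock `a` dominates `b`, i.e. `a` is a newer version."""
    return all(a.get(node, 0) >= n for node, n in b.items())

def reconcile(clock_a: dict[str, int], clock_b: dict[str, int]) -> str:
    if descends(clock_a, clock_b):
        return "keep a"    # a supersedes b
    if descends(clock_b, clock_a):
        return "keep b"    # b supersedes a
    return "conflict"      # concurrent writes: hand both versions to the application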
Empty file.
278 changes: 278 additions & 0 deletions natural-language-processing/ppmi.ipynb

Large diffs are not rendered by default.
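The notebook's diff is not rendered, so its contents are not shown here. As background, PPMI (positive pointwise mutual information) over a word–context co-occurrence matrix is conventionally computed as below (a generic sketch, not the notebook's code):

import numpy as np

def ppmi(counts: np.ndarray) -> np.ndarray:
    """PPMI(w, c) = max(0, log(P(w, c) / (P(w) * P(c))))."""
    total = counts.sum()
    p_wc = counts / total
    p_w = p_wc.sum(axis=1, keepdims=True)  # row (word) marginals
    p_c = p_wc.sum(axis=0, keepdims=True)  # column (context) marginals
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.where(p_wc > 0, np.log(p_wc / (p_w * p_c)), 0.0)
    return np.maximum(pmi, 0.0)  # clip negative PMI to zero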

128 changes: 0 additions & 128 deletions natural-language-processing/tf-idf.py

File renamed: contents are identical to natural-language-processing/tfidf.py below.
128 changes: 128 additions & 0 deletions natural-language-processing/tfidf.py
@@ -0,0 +1,128 @@
import numpy as np
import os
from typing import Dict, List, Tuple


def read_markdown_files(root_dir: str, max_depth: int = 8) -> Dict[str, str]:
    """Recursively collect .md files under root_dir into a {path: contents} corpus."""
    corpus = {}

    def process_dir(path: str, depth: int = 0):
        if depth > max_depth:
            return
        try:
            for entry in os.listdir(path):
                full_path = os.path.join(path, entry)
                if os.path.isdir(full_path):
                    process_dir(full_path, depth + 1)
                elif entry.endswith(".md"):
                    try:
                        with open(full_path, "r") as f:
                            corpus[full_path] = f.read()
                    except Exception:
                        continue
        except Exception:
            return

    process_dir(root_dir)
    return corpus


def create_index(corpus: Dict[str, str]) -> Tuple[Dict[str, Dict[str, int]], List[str]]:
    """Build an inverted index mapping word -> {document: term count}."""
    inv_idx = {}
    for doc_name, content in corpus.items():
        for word in content.split():
            if word not in inv_idx:
                inv_idx[word] = {}
            inv_idx[word][doc_name] = inv_idx[word].get(doc_name, 0) + 1

    return inv_idx, list(inv_idx.keys())


def calculate_tfidf(
    corpus: Dict[str, str], inv_idx: Dict[str, Dict[str, int]], word_list: List[str]
) -> Dict[str, np.ndarray]:
    """Compute one TF-IDF vector per document (tf = count / doc length, idf = log(N / df))."""
    N = len(corpus)
    tfidf = {}

    for doc_name, content in corpus.items():
        doc_len = len(content.split())
        tfidf[doc_name] = np.zeros(len(word_list))
        if doc_len == 0:
            continue  # skip empty documents to avoid division by zero

        for i, word in enumerate(word_list):
            if word in inv_idx and doc_name in inv_idx[word]:
                tf = inv_idx[word][doc_name] / doc_len
                idf = np.log(N / len(inv_idx[word]))
                tfidf[doc_name][i] = tf * idf

    return tfidf


def search(
    query: str,
    tfidf: Dict[str, np.ndarray],
    word_list: List[str],
    inv_idx: Dict[str, Dict[str, int]],
    num_results: int = 5,
) -> List[Tuple[str, float]]:
    """Rank documents by dot product with an IDF-weighted query vector."""
    N = len(tfidf)
    query_words = set(query.split())  # split once, not per vocabulary word
    query_vec = np.zeros(len(word_list))

    for i, word in enumerate(word_list):
        if word in query_words and word in inv_idx:
            query_vec[i] = np.log(N / len(inv_idx[word]))

    similarities = [
        (doc_name, float(np.dot(doc_vec, query_vec)))
        for doc_name, doc_vec in tfidf.items()
    ]

    return sorted(similarities, key=lambda x: x[1], reverse=True)[:num_results]


def main():
    try:
        root_dir = (
            input("Enter root directory to search (default '.'): ").strip() or "."
        )
        print(f"Reading markdown files from {root_dir}...")

        corpus = read_markdown_files(root_dir)
        if not corpus:
            print("No markdown files found!")
            return

        print(f"Found {len(corpus)} markdown files")
        print("Building search index...")

        inv_idx, word_list = create_index(corpus)
        tfidf = calculate_tfidf(corpus, inv_idx, word_list)

        print("Search system ready!")

        while True:
            query = input('\nEnter search query (or "exit" to quit): ').strip()
            if query.lower() == "exit":
                break

            results = search(query, tfidf, word_list, inv_idx)

            print("\nSearch results:")
            for doc_name, similarity in results:
                print(f"{doc_name}: {similarity:.4f}")

    except KeyboardInterrupt:
        print("\nExiting...")
    except Exception as e:
        print(f"\nError: {str(e)}")


if __name__ == "__main__":
    main()
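Besides the interactive `main()` loop, the functions compose directly. A small usage sketch, assuming the module is importable as `tfidf` and is run from the repository root:

from tfidf import read_markdown_files, create_index, calculate_tfidf, search

corpus = read_markdown_files(".")                      # {path: file contents}
inv_idx, word_list = create_index(corpus)              # word -> {doc: count}
vectors = calculate_tfidf(corpus, inv_idx, word_list)  # doc -> tf-idf vector

for doc, score in search("topological sort", vectors, word_list, inv_idx, num_results=3):
    print(f"{score:.4f}  {doc}")

Note that tokenization is a bare str.split(), so queries match exact, case-sensitive tokens only.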
Binary file added papers/gfs.pdf
Binary file added papers/google-cluster-architecture.pdf
Binary file added papers/lessons-from-giant-scale-services.pdf
Binary file added papers/mapreduce-SDPoLC.pdf
Binary file added systems-research/papers/amorph-os.pdf
Binary file added systems-research/papers/barrelfish.pdf
