diff --git a/natural-language-processing/tf-idf.py b/natural-language-processing/tf-idf.py
new file mode 100644
index 0000000..b1574be
--- /dev/null
+++ b/natural-language-processing/tf-idf.py
@@ -0,0 +1,145 @@
+import os
+import re
+from typing import Dict, List, Tuple
+
+import numpy as np
+
+
+def tokenize(text: str) -> List[str]:
+    """Lowercase text and split it into word tokens (runs of word characters)."""
+    return re.findall(r"\w+", text.lower())
+
+
+def read_markdown_files(root_dir: str, max_depth: int = 8) -> Dict[str, str]:
+    """Recursively collect .md files under root_dir, at most max_depth levels deep."""
+    corpus = {}
+
+    def process_dir(path: str, depth: int = 0):
+        if depth > max_depth:
+            return
+
+        try:
+            for entry in os.listdir(path):
+                full_path = os.path.join(path, entry)
+                if os.path.isdir(full_path):
+                    process_dir(full_path, depth + 1)
+                elif entry.endswith(".md"):
+                    try:
+                        with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
+                            corpus[full_path] = f.read()
+                    except OSError:
+                        continue  # unreadable file: skip it
+        except OSError:
+            return  # unreadable directory: skip it
+
+    process_dir(root_dir)
+    return corpus
+
+
+def create_index(corpus: Dict[str, str]) -> Tuple[Dict[str, Dict[str, int]], List[str]]:
+    """Build an inverted index mapping word -> {document -> term count}."""
+    inv_idx: Dict[str, Dict[str, int]] = {}
+    for doc_name, content in corpus.items():
+        for word in tokenize(content):
+            if word not in inv_idx:
+                inv_idx[word] = {}
+            inv_idx[word][doc_name] = inv_idx[word].get(doc_name, 0) + 1
+
+    return inv_idx, list(inv_idx.keys())
+
+
+def calculate_tfidf(
+    corpus: Dict[str, str], inv_idx: Dict[str, Dict[str, int]], word_list: List[str]
+) -> Dict[str, np.ndarray]:
+    """Compute a TF-IDF vector for each document:
+
+    tf  = term count / document length
+    idf = log(N / number of documents containing the term)
+    """
+    N = len(corpus)
+    tfidf = {}
+
+    for doc_name, content in corpus.items():
+        doc_len = len(tokenize(content))
+        tfidf[doc_name] = np.zeros(len(word_list))
+        if doc_len == 0:
+            continue  # empty document: leave its vector at zero
+
+        for i, word in enumerate(word_list):
+            if doc_name in inv_idx[word]:
+                tf = inv_idx[word][doc_name] / doc_len
+                idf = np.log(N / len(inv_idx[word]))
+                tfidf[doc_name][i] = tf * idf
+
+    return tfidf
+
+
+def search(
+    query: str,
+    tfidf: Dict[str, np.ndarray],
+    word_list: List[str],
+    inv_idx: Dict[str, Dict[str, int]],
+    num_results: int = 5,
+) -> List[Tuple[str, float]]:
+    """Rank documents by the dot product of their TF-IDF vectors with the query vector."""
+    N = len(tfidf)
+    word_positions = {word: i for i, word in enumerate(word_list)}
+    query_vec = np.zeros(len(word_list))
+
+    # Weight each query term by its IDF; terms not in the index are ignored.
+    for word in tokenize(query):
+        if word in word_positions:
+            query_vec[word_positions[word]] = np.log(N / len(inv_idx[word]))
+
+    similarities = [
+        (doc_name, float(np.dot(doc_vec, query_vec)))
+        for doc_name, doc_vec in tfidf.items()
+    ]
+
+    # Keep only documents that matched at least one query term.
+    similarities = [s for s in similarities if s[1] > 0]
+    return sorted(similarities, key=lambda x: x[1], reverse=True)[:num_results]
+
+
+def main():
+    try:
+        root_dir = (
+            input("Enter root directory to search (default '.'): ").strip() or "."
+        )
+        print(f"Reading markdown files from {root_dir}...")
+
+        corpus = read_markdown_files(root_dir)
+        if not corpus:
+            print("No markdown files found!")
+            return
+
+        print(f"Found {len(corpus)} markdown files")
+        print("Building search index...")
+
+        inv_idx, word_list = create_index(corpus)
+        tfidf = calculate_tfidf(corpus, inv_idx, word_list)
+
+        print("Search system ready!")
+
+        while True:
+            query = input('\nEnter search query (or "exit" to quit): ').strip()
+            if query.lower() == "exit":
+                break
+
+            results = search(query, tfidf, word_list, inv_idx)
+            if not results:
+                print("\nNo matching documents.")
+                continue
+
+            print("\nSearch results:")
+            for doc_name, similarity in results:
+                print(f"{doc_name}: {similarity:.4f}")
+
+    except (KeyboardInterrupt, EOFError):
+        print("\nExiting...")
+    except Exception as e:
+        print(f"\nError: {e}")
+
+
+if __name__ == "__main__":
+    main()
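
For reference, a minimal sketch of driving the same pipeline programmatically on a toy in-memory corpus (illustrative only: the hyphen in tf-idf.py rules out a plain import, so the module is loaded via importlib, and the file names and contents below are invented):

import importlib.util

spec = importlib.util.spec_from_file_location(
    "tfidf_search", "natural-language-processing/tf-idf.py"
)
tfidf_search = importlib.util.module_from_spec(spec)
spec.loader.exec_module(tfidf_search)  # main() stays dormant: __name__ != "__main__"

# Toy corpus standing in for the output of read_markdown_files().
corpus = {
    "cats.md": "The cat sat on the mat.",
    "dogs.md": "The dog chased the cat across the yard.",
}
inv_idx, word_list = tfidf_search.create_index(corpus)
vectors = tfidf_search.calculate_tfidf(corpus, inv_idx, word_list)

# "cats.md" matches no query term, so it is filtered out of the results.
print(tfidf_search.search("dog yard", vectors, word_list, inv_idx))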