Book recommender (#127)
* Book recommender

* imports

* imports

* restructure

* readme

* remove genres

* fixes

* docstrings

* generation script

* assets

* readme

* readme

* minor fixes

* minor

* remove data files

* lock

* fixed command

* readme
neelasha23 authored Mar 1, 2024
1 parent 176ad67 commit 5afc945
Showing 6 changed files with 438 additions and 0 deletions.
37 changes: 37 additions & 0 deletions examples/panel/book-recommender/README.md
@@ -0,0 +1,37 @@
# Book Recommender

A chat assistant that recommends books to the user based on their input.

## Set key

To run this example, you need to set the `OPENAI_API_KEY` environment variable.

```bash
export OPENAI_API_KEY=<your_api_key>
```

## Dataset

Download the [dataset](https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023) to the `book-recommender/` folder, and rename it to `goodreads.csv`.
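
If you have the [Kaggle CLI](https://github.com/Kaggle/kaggle-api) installed and authenticated, a download along these lines should work (this is a sketch, not part of the example; the extracted filename may differ):

```bash
# Assumes the Kaggle CLI is configured; the dataset slug comes from the URL above
kaggle datasets download -d cristaliss/ultimate-book-collection-top-100-books-up-to-2023 --unzip
# Rename the extracted CSV to the name the scripts expect
# (assumes the archive contains a single CSV file)
mv *.csv goodreads.csv
```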


## Generate lookup files

Generate lookup files by running the following script:

```bash
python generate_assets.py --embeddings --verbose
```

Running this command generates `author_to_title.json`, `title_to_description.json`, and `embeddings.json` in the `assets/` folder.
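
Each file is a flat JSON mapping: `author_to_title.json` maps an upper-cased author name to the list of that author's titles, `title_to_description.json` maps an upper-cased title to the list of matching descriptions, and `embeddings.json` maps a title to the embedding vector of its description.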

If you want to generate embeddings for only the first N rows, run the command below (here N=100):

```bash
python generate_assets.py -n 100 --embeddings --verbose
```

## Deployment

Create a zip of `app.py`, `util.py`, `requirements.txt`, and the `assets/` folder, then follow the instructions for deploying a [Panel](https://docs.cloud.ploomber.io/en/latest/apps/panel.html) application.
You also need to set `OPENAI_API_KEY` as an [environment variable](https://docs.cloud.ploomber.io/en/latest/user-guide/env-vars.html) while deploying the application.
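
For the zip step, something along these lines should work (the archive name is arbitrary):

```bash
# Bundle the app files and the generated lookup files for upload
zip -r book-recommender.zip app.py util.py requirements.txt assets/
```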
164 changes: 164 additions & 0 deletions examples/panel/book-recommender/app.py
@@ -0,0 +1,164 @@
"""
Chat application for recommending books to the user.
Input:
Users can submit queries that describe the type of books they are looking for, e.g., suggest fiction novels.
Users can also ask the chat assistant for books by specific author, e.g., recommend books by Dan Brown.
Answers to user's queries will be based on the Goodreads dataset:
https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023
Application logic:
The app determines the closest matches by comparing user query's embedding to the available
book embeddings. Embeddings of books are pre-computed on the description column of every book
and stored in the assets/ folder.
Response:
The chat assistant then determines the top relevant answers shortlisted by comparing embeddings and
provides the top 5 recommendations.
"""

import json
import panel as pn
from openai import OpenAI
from scipy.spatial import KDTree
import numpy as np
from pathlib import Path

from util import get_embedding_from_text

with open(Path("assets", "title_to_description.json"), 'r') as file:
DESCRIPTIONS = json.load(file)

with open(Path("assets", "author_to_title.json"), 'r') as file:
AUTHOR_TITLES = json.load(file)


def load_embeddings_file():
"""Load the pre-computed embeddings of description column
The data is in the format title: embedding
"""
file_path = Path('assets', 'embeddings.json')
with open(file_path, "r", encoding="utf-8") as file:
embeddings_json = json.load(file)
return embeddings_json

client = OpenAI()

pn.extension()


def get_book_description_by_title(title):
"""Return description of a book"""
return DESCRIPTIONS[title.upper()]


def detect_author(user_query):
    """Extract the author name (if any) mentioned in the user query."""
    system_prompt = """
    You're a system that determines the author mentioned in the user query.
    You need to return only the author name. Please fix any typos if possible.
    """
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": "What are some books by Sandra Boynton"},
{"role": "system", "content": "Sandra Boynton"},
{"role": "user", "content": user_query},
],
seed=42,
n=1,
)
author = response.choices[0].message.content.upper()
return author if author in AUTHOR_TITLES else ""


def book_recommender_agent(user_query, verbose=False):
"""An agent that can recommend books to the user based on input"""
embeddings_json = load_embeddings_file()

# If author is mentioned, filter books written by the author.
# Otherwise, consider all the available books.
author = detect_author(user_query)
titles = []
if author:
titles = AUTHOR_TITLES[author]
if verbose:
print(f"Found these titles: {titles} by author: {author}")

filtered_embeddings_by_title = {}
for title in titles:
title_embedding = embeddings_json.get(title, None)
if title_embedding:
filtered_embeddings_by_title[title] = title_embedding
if filtered_embeddings_by_title:
embeddings_json = filtered_embeddings_by_title

titles = []
embeddings = []
for key, value in embeddings_json.items():
if value:
titles.append(key)
embeddings.append(value)
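
    # Nearest-neighbor search: build a KD-tree over the book embeddings and
    # retrieve the (up to) 5 titles closest to the query embedding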
kdtree = KDTree(np.array(embeddings))
_, indexes = kdtree.query(get_embedding_from_text(user_query), k=min(len(titles), 5))

    # KDTree.query returns a scalar index when k == 1, so normalize it to a list
    if isinstance(indexes, np.int64):
indexes = [indexes]
titles_relevant = [titles[i] for i in indexes if titles[i] != "null"]
if verbose:
print(f"Found these relevant titles: {titles_relevant}")
descriptions_relevant = [get_book_description_by_title(title) for title in titles_relevant]

recommendation_text = ""
for i, value in enumerate(titles_relevant):
recommendation_text = f"{recommendation_text}{value}: {descriptions_relevant[i]}\n\n##"

    system_prompt = f"""
    You are a helpful book recommendation system that recommends books to users based on their input.
    Here are the top relevant titles and descriptions (separated by ##) in the format title: description.
    Use these to generate your answer, and disregard books that are not relevant to the user's input:
    {recommendation_text}
    Summarize each description and format the answer properly.
    You can display a maximum of 5 recommendations.
    Please do not suggest any books outside this list.
    """

if verbose:
print(f"System prompt: {system_prompt}")

response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_query},
],
seed=42,
n=1,
)

return response.choices[0].message.content


def callback(contents: str, user: str, instance: pn.chat.ChatInterface):
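    """Panel ChatInterface callback: forward every user message to the recommender agent."""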
return book_recommender_agent(contents)


chat_interface = pn.chat.ChatInterface(callback=callback)
chat_interface.send(
"I am a book recommendation engine! "
"You may ask questions like: \n* Recommend books by Dan Brown.\n"
"* Suggest some books based in the Victorian era.\n\n"
"You can deploy your own by signing up at https://ploomber.io",
user="System",
respond=False,
)

pn.template.MaterialTemplate(
title="Book Recommender",
main=[chat_interface],
).servable()
66 changes: 66 additions & 0 deletions examples/panel/book-recommender/generate_assets.py
@@ -0,0 +1,66 @@
import json
import argparse
import pandas as pd
from pathlib import Path
from openai import OpenAI

from util import get_embedding_from_text

# Download the dataset from:
# https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023
df = pd.read_csv(Path("goodreads.csv"))

client = OpenAI()


def generate_embeddings(rows, verbose=False):
"""Function to generate an embeddings.json file that contains
mapping {title: embedding} where embeddings are generated on the
description column of goodreads.csv file.
"""

final_df = df.head(rows) if rows else df
embeddings_json = {}
if verbose and rows:
print(f"Generating embeddings for {rows} rows")
for index, row in final_df.iterrows():
if verbose:
print(f"Row number: {index}")
embeddings_json[row["title"]] = get_embedding_from_text(row["description"])
path = Path("assets", "embeddings.json")
with open(path, 'w') as f:
json.dump(embeddings_json, f)
if verbose:
print(f"Generated embeddings of description column at '{path}'")


def generate_lookup(verbose=False):
"""Function to generate mappings between columns for faster lookup"""
author_to_title = df.groupby(df['authors'].str.upper())['title'].apply(list).to_dict()
path = Path("assets", "author_to_title.json")
with open(path, 'w') as f:
json.dump(author_to_title, f)
if verbose:
print(f"Generated author to title mappings at '{path}'")
title_to_description = df.groupby(df['title'].str.upper())['description'].apply(list).to_dict()
path = Path("assets", "title_to_description.json")
with open(path, 'w') as f:
json.dump(title_to_description, f)
if verbose:
print(f"Generated title to description mappings at '{path}'")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--verbose', action='store_true', help='Debug')
parser.add_argument('--embeddings', action='store_true', help='Generate embeddings')
parser.add_argument("-n", "--rows", required=False, help="Number of rows to limit for generating embeddings")
args = parser.parse_args()
    assets_path = Path("assets")
    assets_path.mkdir(exist_ok=True)
generate_lookup(args.verbose)
rows = int(args.rows) if args.rows else None
if args.embeddings:
generate_embeddings(rows, args.verbose)

