diff --git a/examples/panel/book-recommender/README.md b/examples/panel/book-recommender/README.md
new file mode 100644
index 00000000..a9249e53
--- /dev/null
+++ b/examples/panel/book-recommender/README.md
@@ -0,0 +1,37 @@
+# Book Recommender
+
+A chat assistant that recommends books based on user input.
+
+## Set key
+
+To run this example, you need to set the `OPENAI_API_KEY` environment variable:
+
+```bash
+export OPENAI_API_KEY=
+```
+
+## Dataset
+
+Download the [dataset](https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023) to the `book-recommender/` folder and rename it to `goodreads.csv`.
+
+
+## Generate lookup files
+
+Generate the lookup files by running the following script:
+
+```bash
+python generate_assets.py --embeddings --verbose
+```
+
+Running this command should generate the `author_to_title.json`, `title_to_description.json`, and `embeddings.json` files in the `assets/` folder.
+
+To generate embeddings for only the first N rows, run the command below (here N=100):
+
+```bash
+python generate_assets.py -n 100 --embeddings --verbose
+```
+
+## Deployment
+
+Create a zip file containing `app.py`, `util.py`, `requirements.txt`, and the `assets/` folder, and follow the instructions for deploying a [Panel](https://docs.cloud.ploomber.io/en/latest/apps/panel.html) application.
+You also need to set `OPENAI_API_KEY` as an [environment variable](https://docs.cloud.ploomber.io/en/latest/user-guide/env-vars.html) when deploying the application.
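As a quick orientation before the application code: the lookup files written by `generate_assets.py` are plain JSON, and a minimal sketch for inspecting them could look like the following. This assumes `generate_assets.py` has already been run in this folder; the printed values depend entirely on your copy of the CSV.

```python
import json
from pathlib import Path

# Illustrative shapes only -- the real keys come from the Goodreads CSV.
# author_to_title.json       : upper-cased author name -> list of that author's titles
# title_to_description.json  : upper-cased title -> list of descriptions
# embeddings.json            : title (as written in the CSV) -> embedding vector (list of floats)
author_to_title = json.loads(Path("assets", "author_to_title.json").read_text())
embeddings = json.loads(Path("assets", "embeddings.json").read_text())

some_author, their_titles = next(iter(author_to_title.items()))
some_title, vector = next(iter(embeddings.items()))
print(some_author, their_titles[:3])
print(some_title, len(vector))  # text-embedding-3-small vectors are 1536-dimensional by default
```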
+""" + +import json +import panel as pn +from openai import OpenAI +from scipy.spatial import KDTree +import numpy as np +from pathlib import Path + +from util import get_embedding_from_text + +with open(Path("assets", "title_to_description.json"), 'r') as file: + DESCRIPTIONS = json.load(file) + +with open(Path("assets", "author_to_title.json"), 'r') as file: + AUTHOR_TITLES = json.load(file) + + +def load_embeddings_file(): + """Load the pre-computed embeddings of description column + The data is in the format title: embedding + """ + file_path = Path('assets', 'embeddings.json') + with open(file_path, "r", encoding="utf-8") as file: + embeddings_json = json.load(file) + return embeddings_json + +client = OpenAI() + +pn.extension() + + +def get_book_description_by_title(title): + """Return description of a book""" + return DESCRIPTIONS[title.upper()] + + +def detect_author(user_query): + system_prompt = f""" + You're a system that determines the author in user query. + + You need to return only the author name.Please fix any typo if possible +""" + response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": "What are some books by Sandra Boynton"}, + {"role": "system", "content": "Sandra Boynton"}, + {"role": "user", "content": user_query}, + ], + seed=42, + n=1, + ) + author = response.choices[0].message.content.upper() + return author if author in AUTHOR_TITLES else "" + + +def book_recommender_agent(user_query, verbose=False): + """An agent that can recommend books to the user based on input""" + embeddings_json = load_embeddings_file() + + # If author is mentioned, filter books written by the author. + # Otherwise, consider all the available books. + author = detect_author(user_query) + titles = [] + if author: + titles = AUTHOR_TITLES[author] + if verbose: + print(f"Found these titles: {titles} by author: {author}") + + filtered_embeddings_by_title = {} + for title in titles: + title_embedding = embeddings_json.get(title, None) + if title_embedding: + filtered_embeddings_by_title[title] = title_embedding + if filtered_embeddings_by_title: + embeddings_json = filtered_embeddings_by_title + + titles = [] + embeddings = [] + for key, value in embeddings_json.items(): + if value: + titles.append(key) + embeddings.append(value) + kdtree = KDTree(np.array(embeddings)) + _, indexes = kdtree.query(get_embedding_from_text(user_query), k=min(len(titles), 5)) + + if isinstance(indexes, np.int64): + indexes = [indexes] + titles_relevant = [titles[i] for i in indexes if titles[i] != "null"] + if verbose: + print(f"Found these relevant titles: {titles_relevant}") + descriptions_relevant = [get_book_description_by_title(title) for title in titles_relevant] + + recommendation_text = "" + for i, value in enumerate(titles_relevant): + recommendation_text = f"{recommendation_text}{value}: {descriptions_relevant[i]}\n\n##" + + system_prompt = f""" +You are a helpful book recommendation system that can recommend users books based on their inputs. + +Here are the top relevant titles and descriptions (separated by ##) in the format titles: descriptions, +use these to generate your answer, +and disregard books that are not relevant to user's input. You can display 5 or less recommendations.: + +{recommendation_text} + +You should also create a summary of the description and format the answer properly. +You can display a maximum of 5 recommendations. +Please do not suggest any books outside this list. 
+""" + + if verbose: + print(f"System prompt: {system_prompt}") + + response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_query}, + ], + seed=42, + n=1, + ) + + return response.choices[0].message.content + + +def callback(contents: str, user: str, instance: pn.chat.ChatInterface): + return book_recommender_agent(contents) + + +chat_interface = pn.chat.ChatInterface(callback=callback) +chat_interface.send( + "I am a book recommendation engine! " + "You may ask questions like: \n* Recommend books by Dan Brown.\n" + "* Suggest some books based in the Victorian era.\n\n" + "You can deploy your own by signing up at https://ploomber.io", + user="System", + respond=False, +) + +pn.template.MaterialTemplate( + title="Book Recommender", + main=[chat_interface], +).servable() diff --git a/examples/panel/book-recommender/generate_assets.py b/examples/panel/book-recommender/generate_assets.py new file mode 100644 index 00000000..d3056362 --- /dev/null +++ b/examples/panel/book-recommender/generate_assets.py @@ -0,0 +1,66 @@ +import json +import argparse +import pandas as pd +from pathlib import Path +from openai import OpenAI + +from util import get_embedding_from_text + +# Download the dataset from: +# https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023 +df = pd.read_csv(Path("goodreads.csv")) + +client = OpenAI() + + +def generate_embeddings(rows, verbose=False): + """Function to generate an embeddings.json file that contains + mapping {title: embedding} where embeddings are generated on the + description column of goodreads.csv file. + """ + + final_df = df.head(rows) if rows else df + embeddings_json = {} + if verbose and rows: + print(f"Generating embeddings for {rows} rows") + for index, row in final_df.iterrows(): + if verbose: + print(f"Row number: {index}") + embeddings_json[row["title"]] = get_embedding_from_text(row["description"]) + path = Path("assets", "embeddings.json") + with open(path, 'w') as f: + json.dump(embeddings_json, f) + if verbose: + print(f"Generated embeddings of description column at '{path}'") + + +def generate_lookup(verbose=False): + """Function to generate mappings between columns for faster lookup""" + author_to_title = df.groupby(df['authors'].str.upper())['title'].apply(list).to_dict() + path = Path("assets", "author_to_title.json") + with open(path, 'w') as f: + json.dump(author_to_title, f) + if verbose: + print(f"Generated author to title mappings at '{path}'") + title_to_description = df.groupby(df['title'].str.upper())['description'].apply(list).to_dict() + path = Path("assets", "title_to_description.json") + with open(path, 'w') as f: + json.dump(title_to_description, f) + if verbose: + print(f"Generated title to description mappings at '{path}'") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--verbose', action='store_true', help='Debug') + parser.add_argument('--embeddings', action='store_true', help='Generate embeddings') + parser.add_argument("-n", "--rows", required=False, help="Number of rows to limit for generating embeddings") + args = parser.parse_args() + assets_path = Path("assets") + if not assets_path.exists(): + Path("assets").mkdir() + generate_lookup(args.verbose) + rows = int(args.rows) if args.rows else None + if args.embeddings: + generate_embeddings(rows, args.verbose) + diff --git a/examples/panel/book-recommender/requirements.lock.txt 
b/examples/panel/book-recommender/requirements.lock.txt new file mode 100644 index 00000000..258985fe --- /dev/null +++ b/examples/panel/book-recommender/requirements.lock.txt @@ -0,0 +1,154 @@ +annotated-types==0.6.0 +anyio==4.3.0 +appnope==0.1.4 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==23.2.0 +Babel==2.14.0 +beautifulsoup4==4.9.3 +bleach==6.1.0 +bokeh==3.3.4 +Brotli==1.1.0 +bs4==0.0.2 +certifi==2024.2.2 +cffi==1.16.0 +charset-normalizer==2.0.12 +click==8.1.7 +comm==0.2.1 +contourpy==1.2.0 +dash==1.0.0 +dash_core_components==1.0.0 +dash_html_components==1.0.0 +dash_renderer==1.0.0 +dash_table==4.0.0 +DateTime==4.3 +debugpy==1.8.1 +decorator==5.1.1 +defusedxml==0.7.1 +distro==1.9.0 +dnspython==1.16.0 +executing==2.0.1 +fastjsonschema==2.19.1 +feedparser==6.0.11 +Flask==2.1.3 +Flask-Compress==1.14 +fqdn==1.5.1 +gnews==0.3.6 +gunicorn==21.2.0 +h11==0.14.0 +httpcore==1.0.4 +httpx==0.27.0 +idna==3.6 +iniconfig==2.0.0 +ipykernel==6.29.3 +ipython==8.22.1 +isoduration==20.11.0 +itsdangerous==2.1.2 +jedi==0.19.1 +Jinja2==3.1.3 +json5==0.9.17 +jsonpointer==2.4 +jsonschema==4.21.1 +jsonschema-specifications==2023.12.1 +jupyter-events==0.9.0 +jupyter-lsp==2.2.3 +jupyter_client==8.6.0 +jupyter_core==5.7.1 +jupyter_server==2.12.5 +jupyter_server_terminals==0.5.2 +jupyterlab==4.1.2 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.25.3 +linkify-it-py==2.0.3 +lxml==5.1.0 +Markdown==3.5.2 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib-inline==0.1.6 +mdit-py-plugins==0.4.0 +mdurl==0.1.2 +mistune==3.0.2 +nbclient==0.9.0 +nbconvert==7.16.1 +nbformat==5.9.2 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +numpy==1.26.4 +openai==1.12.0 +outcome==1.3.0.post0 +overrides==7.7.0 +packaging==23.2 +pandas==2.2.1 +pandocfilters==1.5.1 +panel==1.3.8 +param==2.0.2 +parso==0.8.3 +pathlib==1.0.1 +percy==2.0.2 +pexpect==4.9.0 +pillow==10.2.0 +platformdirs==4.2.0 +plotly==5.19.0 +pluggy==1.4.0 +prometheus_client==0.20.0 +prompt-toolkit==3.0.43 +psutil==5.9.8 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pycparser==2.21 +pydantic==2.6.2 +pydantic_core==2.16.3 +Pygments==2.17.2 +pymongo==3.12.3 +PySocks==1.7.1 +pytest==8.0.2 +pytest-mock==3.12.0 +pytest-sugar==1.0.0 +python-dateutil==2.8.2 +python-dotenv==0.19.2 +python-json-logger==2.0.7 +pytz==2024.1 +pyviz_comms==3.0.1 +PyYAML==6.0.1 +pyzmq==25.1.2 +referencing==0.33.0 +requests==2.31.0 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.18.0 +scipy==1.12.0 +selenium==4.18.1 +Send2Trash==1.8.2 +sgmllib3k==1.0.0 +six==1.16.0 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.5 +stack-data==0.6.3 +tenacity==8.2.3 +termcolor==2.4.0 +terminado==0.18.0 +tinycss2==1.2.1 +tornado==6.4 +tqdm==4.66.2 +traitlets==5.14.1 +trio==0.24.0 +trio-websocket==0.11.1 +types-python-dateutil==2.8.19.20240106 +typing_extensions==4.10.0 +tzdata==2024.1 +uc-micro-py==1.0.3 +uri-template==1.3.0 +urllib3==1.26.18 +waitress==3.0.0 +wcwidth==0.2.13 +webcolors==1.13 +webencodings==0.5.1 +websocket-client==1.7.0 +Werkzeug==2.3.8 +wsproto==1.2.0 +xyzservices==2023.10.1 +zope.interface==6.2 diff --git a/examples/panel/book-recommender/requirements.txt b/examples/panel/book-recommender/requirements.txt new file mode 100644 index 00000000..52661671 --- /dev/null +++ b/examples/panel/book-recommender/requirements.txt @@ -0,0 +1,4 @@ +panel +openai +scipy +numpy \ No newline at end of file diff --git a/examples/panel/book-recommender/util.py b/examples/panel/book-recommender/util.py new file mode 100644 index 00000000..13de76a7 --- 
/dev/null
+++ b/examples/panel/book-recommender/util.py
@@ -0,0 +1,13 @@
+from openai import OpenAI
+
+client = OpenAI()
+
+
+def get_embedding_from_text(text):
+    """Generate an embedding for the given text; return an empty list on failure."""
+    try:
+        response = client.embeddings.create(input=text, model="text-embedding-3-small")
+        embedding = response.data[0].embedding
+        return embedding
+    except Exception:
+        return []
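To sanity-check the embedding helper before running it over the whole dataset, a quick script along these lines should work. It issues one real API call, so `OPENAI_API_KEY` must be set; the sample text is arbitrary, and the expected length of 1536 assumes the default output size of `text-embedding-3-small`.

```python
from util import get_embedding_from_text

vector = get_embedding_from_text("A detective thriller set in Victorian London.")
if vector:
    print(f"Got an embedding of length {len(vector)}")  # expected: 1536
else:
    # get_embedding_from_text swallows API errors and returns [] on failure
    print("Embedding request failed -- check OPENAI_API_KEY and your network access")
```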