Book recommender (#127)
* Book recommender

* imports

* imports

* restructure

* readme

* remove genres

* fixes

* docstrings

* generation script

* assets

* readme

* readme

* minor fixes

* minor

* remove data files

* lock

* fixed command

* readme
neelasha23 authored Mar 1, 2024
1 parent 176ad67 commit 5afc945
Showing 6 changed files with 438 additions and 0 deletions.
37 changes: 37 additions & 0 deletions examples/panel/book-recommender/README.md
@@ -0,0 +1,37 @@
# Book Recommender

A chat assistant that recommends books to the user based on their input.

## Set key

To run this example, you need to set the `OPENAI_API_KEY` environment variable.

```bash
export OPENAI_API_KEY=<your_api_key>
```

## Dataset

Download the [dataset](https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023) to the `book-recommender/` folder, and rename it to `goodreads.csv`.
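
If you have the [Kaggle CLI](https://github.com/Kaggle/kaggle-api) installed and authenticated, a download along these lines should work (this is a sketch, not part of the example; the extracted filename may differ):

```bash
# Assumes the Kaggle CLI is configured; the dataset slug comes from the URL above
kaggle datasets download -d cristaliss/ultimate-book-collection-top-100-books-up-to-2023 --unzip
# Rename the extracted CSV to the name the scripts expect
# (assumes the archive contains a single CSV file)
mv *.csv goodreads.csv
```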


## Generate lookup files

Generate lookup files by running the following script:

```bash
python generate_assets.py --embeddings --verbose
```

Running this command generates `author_to_title.json`, `title_to_description.json`, and `embeddings.json` in the `assets/` folder.
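
Each file is a flat JSON mapping: `author_to_title.json` maps an upper-cased author name to the list of that author's titles, `title_to_description.json` maps an upper-cased title to the list of matching descriptions, and `embeddings.json` maps a title to the embedding vector of its description.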

If you want to generate embeddings for only the first N rows, run the command below (here N=100):

```bash
python generate_assets.py -n 100 --embeddings --verbose
```

## Deployment

Create a zip of `app.py`, `util.py`, `requirements.txt`, and the `assets/` folder, then follow the instructions for deploying a [Panel](https://docs.cloud.ploomber.io/en/latest/apps/panel.html) application.
You also need to set `OPENAI_API_KEY` as an [environment variable](https://docs.cloud.ploomber.io/en/latest/user-guide/env-vars.html) while deploying the application.
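
For the zip step, something along these lines should work (the archive name is arbitrary):

```bash
# Bundle the app files and the generated lookup files for upload
zip -r book-recommender.zip app.py util.py requirements.txt assets/
```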
164 changes: 164 additions & 0 deletions examples/panel/book-recommender/app.py
@@ -0,0 +1,164 @@
"""
Chat application for recommending books to the user.
Input:
Users can submit queries that describe the type of books they are looking for, e.g., suggest fiction novels.
Users can also ask the chat assistant for books by specific author, e.g., recommend books by Dan Brown.
Answers to user's queries will be based on the Goodreads dataset:
https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023
Application logic:
The app determines the closest matches by comparing user query's embedding to the available
book embeddings. Embeddings of books are pre-computed on the description column of every book
and stored in the assets/ folder.
Response:
The chat assistant then determines the top relevant answers shortlisted by comparing embeddings and
provides the top 5 recommendations.
"""

import json
import panel as pn
from openai import OpenAI
from scipy.spatial import KDTree
import numpy as np
from pathlib import Path

from util import get_embedding_from_text

with open(Path("assets", "title_to_description.json"), 'r') as file:
DESCRIPTIONS = json.load(file)

with open(Path("assets", "author_to_title.json"), 'r') as file:
AUTHOR_TITLES = json.load(file)


def load_embeddings_file():
"""Load the pre-computed embeddings of description column
The data is in the format title: embedding
"""
file_path = Path('assets', 'embeddings.json')
with open(file_path, "r", encoding="utf-8") as file:
embeddings_json = json.load(file)
return embeddings_json

client = OpenAI()

pn.extension()


def get_book_description_by_title(title):
"""Return description of a book"""
return DESCRIPTIONS[title.upper()]


def detect_author(user_query):
    """Extract the author name (if any) mentioned in the user query."""
    system_prompt = """
    You're a system that determines the author mentioned in the user query.
    You need to return only the author name. Please fix any typos if possible.
    """
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": "What are some books by Sandra Boynton"},
{"role": "system", "content": "Sandra Boynton"},
{"role": "user", "content": user_query},
],
seed=42,
n=1,
)
author = response.choices[0].message.content.upper()
return author if author in AUTHOR_TITLES else ""


def book_recommender_agent(user_query, verbose=False):
"""An agent that can recommend books to the user based on input"""
embeddings_json = load_embeddings_file()

# If author is mentioned, filter books written by the author.
# Otherwise, consider all the available books.
author = detect_author(user_query)
titles = []
if author:
titles = AUTHOR_TITLES[author]
if verbose:
print(f"Found these titles: {titles} by author: {author}")

filtered_embeddings_by_title = {}
for title in titles:
title_embedding = embeddings_json.get(title, None)
if title_embedding:
filtered_embeddings_by_title[title] = title_embedding
if filtered_embeddings_by_title:
embeddings_json = filtered_embeddings_by_title

titles = []
embeddings = []
for key, value in embeddings_json.items():
if value:
titles.append(key)
embeddings.append(value)
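
    # Nearest-neighbor search: build a KD-tree over the book embeddings and
    # retrieve the (up to) 5 titles closest to the query embedding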
kdtree = KDTree(np.array(embeddings))
_, indexes = kdtree.query(get_embedding_from_text(user_query), k=min(len(titles), 5))

    # KDTree.query returns a scalar index when k == 1, so normalize it to a list
    if isinstance(indexes, np.int64):
indexes = [indexes]
titles_relevant = [titles[i] for i in indexes if titles[i] != "null"]
if verbose:
print(f"Found these relevant titles: {titles_relevant}")
descriptions_relevant = [get_book_description_by_title(title) for title in titles_relevant]

recommendation_text = ""
for i, value in enumerate(titles_relevant):
recommendation_text = f"{recommendation_text}{value}: {descriptions_relevant[i]}\n\n##"

    system_prompt = f"""
    You are a helpful book recommendation system that recommends books to users based on their input.
    Here are the top relevant titles and descriptions (separated by ##) in the format title: description.
    Use these to generate your answer, and disregard books that are not relevant to the user's input:
    {recommendation_text}
    Summarize each description and format the answer properly.
    You can display a maximum of 5 recommendations.
    Please do not suggest any books outside this list.
    """

if verbose:
print(f"System prompt: {system_prompt}")

response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_query},
],
seed=42,
n=1,
)

return response.choices[0].message.content


def callback(contents: str, user: str, instance: pn.chat.ChatInterface):
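    """Panel ChatInterface callback: forward every user message to the recommender agent."""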
return book_recommender_agent(contents)


chat_interface = pn.chat.ChatInterface(callback=callback)
chat_interface.send(
"I am a book recommendation engine! "
"You may ask questions like: \n* Recommend books by Dan Brown.\n"
"* Suggest some books based in the Victorian era.\n\n"
"You can deploy your own by signing up at https://ploomber.io",
user="System",
respond=False,
)

pn.template.MaterialTemplate(
title="Book Recommender",
main=[chat_interface],
).servable()
66 changes: 66 additions & 0 deletions examples/panel/book-recommender/generate_assets.py
@@ -0,0 +1,66 @@
import json
import argparse
import pandas as pd
from pathlib import Path
from openai import OpenAI

from util import get_embedding_from_text

# Download the dataset from:
# https://www.kaggle.com/datasets/cristaliss/ultimate-book-collection-top-100-books-up-to-2023
df = pd.read_csv(Path("goodreads.csv"))

client = OpenAI()


def generate_embeddings(rows, verbose=False):
"""Function to generate an embeddings.json file that contains
mapping {title: embedding} where embeddings are generated on the
description column of goodreads.csv file.
"""

final_df = df.head(rows) if rows else df
embeddings_json = {}
if verbose and rows:
print(f"Generating embeddings for {rows} rows")
for index, row in final_df.iterrows():
if verbose:
print(f"Row number: {index}")
embeddings_json[row["title"]] = get_embedding_from_text(row["description"])
path = Path("assets", "embeddings.json")
with open(path, 'w') as f:
json.dump(embeddings_json, f)
if verbose:
print(f"Generated embeddings of description column at '{path}'")


def generate_lookup(verbose=False):
"""Function to generate mappings between columns for faster lookup"""
author_to_title = df.groupby(df['authors'].str.upper())['title'].apply(list).to_dict()
path = Path("assets", "author_to_title.json")
with open(path, 'w') as f:
json.dump(author_to_title, f)
if verbose:
print(f"Generated author to title mappings at '{path}'")
title_to_description = df.groupby(df['title'].str.upper())['description'].apply(list).to_dict()
path = Path("assets", "title_to_description.json")
with open(path, 'w') as f:
json.dump(title_to_description, f)
if verbose:
print(f"Generated title to description mappings at '{path}'")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--verbose', action='store_true', help='Debug')
parser.add_argument('--embeddings', action='store_true', help='Generate embeddings')
parser.add_argument("-n", "--rows", required=False, help="Number of rows to limit for generating embeddings")
args = parser.parse_args()
    assets_path = Path("assets")
    assets_path.mkdir(exist_ok=True)
generate_lookup(args.verbose)
rows = int(args.rows) if args.rows else None
if args.embeddings:
generate_embeddings(rows, args.verbose)

