diff --git a/.gitignore b/.gitignore
index 5025672..c368399 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,180 @@
-arxiv-metadata-oai-snapshot.json
+/data/arxiv-metadata-oai-snapshot.json
+/data/**/checkpoint*/
*.pkl
*.DS_STORE
-*.log
\ No newline at end of file
+*.log
+
+## Standard Python ignores:
+arxiv-metadata-oai-snapshot.json
+*.DS_STORE
+*.log
+.tool-versions
+data/.ipynb_checkpoints/
+.env
+
+data/wandb/**
+data/paper-multilabel-finetuning
+*.zip
+data/checkpoint
+
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/README.md b/README.md
index 243baca..2e403e9 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,21 @@ Both **Redis Stack** and the paper search app run with **Docker Compose** using
$ docker compose -f docker-local-redis.yml up
```
+### Running the code locally, Redis in Docker
+For local development.
+
+Build frontend and install Python dependencies locally:
+```bash
+$ sh install-local.sh # compiles frontend, installs backend dependencies
+```
+
+Run local Redis in docker, but execute code locally so that code changes are reloaded automatically:
+```bash
+$ sh run-local.sh # runs Redis in local docker, runs code in local environment without docker
+```
+
+If your system provides the standalone `docker-compose` binary instead of the `docker compose` subcommand, run `export DOCKER_COMPOSE="docker-compose"` before running `run-local.sh`.
+
### Customizing (optional)
You can use the Jupyter Notebooks in the [`data/`](data/README.md) directory to create paper embeddings and metadata. The pickled dataframe will end up stored in the `data/` directory and used when creating your own container.
diff --git a/backend/vecsim_app/api/routes.py b/backend/vecsim_app/api/routes.py
index f894e14..20c7c9c 100644
--- a/backend/vecsim_app/api/routes.py
+++ b/backend/vecsim_app/api/routes.py
@@ -1,23 +1,35 @@
import asyncio
+import logging
import typing as t
-import redis.asyncio as redis
+import redis.asyncio as redis
from fastapi import APIRouter
from vecsim_app import config
+from vecsim_app.categories import CATEGORIES
from vecsim_app.embeddings import Embeddings
from vecsim_app.models import Paper
-
+from vecsim_app.multilabel_classifier.inference import load_models, predict_categories
from vecsim_app.schema import (
+ CategoriesPredictionRequest,
SimilarityRequest,
UserTextSimilarityRequest
)
from vecsim_app.search_index import SearchIndex
-
paper_router = r = APIRouter()
redis_client = redis.from_url(config.REDIS_URL)
embeddings = Embeddings()
search_index = SearchIndex()
+STATE = {}
+
+mlc_path = f"{config.DATA_LOCATION}/multilabel_classifier/checkpoint"
+mlc_model, mlc_tokenizer, mlc_b = load_models(mlc_path, f"{mlc_path}/mlb.pkl")
+
+
+def _cut_off_category_description(c: str):
+ # 'q-fin.TR (Trading and Market Microstructure)' -> 'q-fin.TR'
+ return c.split()[0]
+
async def process_paper(p, i: int) -> t.Dict[str, t.Any]:
paper = await Paper.get(p.paper_pk)
@@ -26,42 +38,44 @@ async def process_paper(p, i: int) -> t.Dict[str, t.Any]:
paper['similarity_score'] = score
return paper
+
async def papers_from_results(total, results) -> t.Dict[str, t.Any]:
# extract papers from VSS results
- return {
- 'total': total,
- 'papers': [
- await process_paper(p, i)
- for i, p in enumerate(results.docs)
+ results = [await process_paper(p, i) for i, p in enumerate(results.docs)]
+ dump = "\n".join(
+ [
+ f" [{r['similarity_score']:.3f}] " + r['title'].replace('\n', ' ')
+ for r in results
]
+ )
+ logging.debug(f"Retrieved {len(results)} papers:\n" + dump)
+ return {
+ "total": total,
+ "papers": results,
}
@r.get("/", response_model=t.Dict)
async def get_papers(
- limit: int = 20,
- skip: int = 0,
- years: str = "",
- categories: str = ""
+ limit: int = 20, skip: int = 0, years: str = "", categories: str = ""
):
papers = []
expressions = []
- years = [year for year in years.split(",") if year]
- categories = [cat for cat in categories.split(",") if cat]
+ years = [y for y in years.split(",") if y]
+ categories = [_cut_off_category_description(c) for c in categories.split(",") if c]
if years and categories:
- expressions.append(
- (Paper.year << years) & \
- (Paper.categories << categories)
- )
+ expressions.append((Paper.year << years) & (Paper.categories << categories))
elif years and not categories:
expressions.append(Paper.year << years)
elif categories and not years:
expressions.append(Paper.categories << categories)
# Run query
- papers = await Paper.find(*expressions)\
- .copy(offset=skip, limit=limit)\
+ papers = (
+ await Paper.find(*expressions)
+ .copy(offset=skip, limit=limit)
.execute(exhaust_results=False)
+ )
# Get total count
total = (
@@ -69,24 +83,39 @@ async def get_papers(
search_index.count_query(years=years, categories=categories)
)
).total
+ return {"total": total, "papers": papers}
+
+
+@r.post("/predict-categories", response_model=t.Dict)
+async def route_predict_categories(categories_request: CategoriesPredictionRequest):
+ categories = predict_categories(
+ categories_request.articles,
+ mlc_model,
+ mlc_tokenizer,
+ mlc_b,
+ proba_threshold=categories_request.proba_threshold,
+ )
return {
- 'total': total,
- 'papers': papers
+ "categories": categories,
+ "categories_names": [CATEGORIES.get(c) for c in categories],
}
@r.post("/vectorsearch/text", response_model=t.Dict)
async def find_papers_by_text(similarity_request: SimilarityRequest):
# Create query
+ categories = [
+ _cut_off_category_description(c) for c in similarity_request.categories
+ ]
query = search_index.vector_query(
- similarity_request.categories,
+ categories,
similarity_request.years,
similarity_request.search_type,
- similarity_request.number_of_results
+ similarity_request.number_of_results,
+ categories_operator=similarity_request.categories_operator,
)
count_query = search_index.count_query(
- years=similarity_request.years,
- categories=similarity_request.categories
+ years=similarity_request.years, categories=categories  # trimmed codes, same as the vector query
)
# find the vector of the Paper listed in the request
@@ -96,7 +125,9 @@ async def find_papers_by_text(similarity_request: SimilarityRequest):
# obtain results of the queries
total, results = await asyncio.gather(
redis_client.ft(config.INDEX_NAME).search(count_query),
- redis_client.ft(config.INDEX_NAME).search(query, query_params={"vec_param": vector})
+ redis_client.ft(config.INDEX_NAME).search(
+ query, query_params={"vec_param": vector}
+ ),
)
# Get Paper records of those results
@@ -106,27 +137,38 @@ async def find_papers_by_text(similarity_request: SimilarityRequest):
@r.post("/vectorsearch/text/user", response_model=t.Dict)
async def find_papers_by_user_text(similarity_request: UserTextSimilarityRequest):
# Create query
+ categories = [
+ _cut_off_category_description(c) for c in similarity_request.categories
+ ]
+
query = search_index.vector_query(
- similarity_request.categories,
+ categories,
similarity_request.years,
similarity_request.search_type,
- similarity_request.number_of_results
+ similarity_request.number_of_results,
)
count_query = search_index.count_query(
- years=similarity_request.years,
- categories=similarity_request.categories
+ years=similarity_request.years, categories=categories  # trimmed codes, same as the vector query
)
- # obtain results of the queries
- total, results = await asyncio.gather(
+ articles = [a["text"] for a in similarity_request.articles if a["text"].strip()]
+ if not articles:
+ return {"total": 0, "papers": []}  # same shape as papers_from_results
+ article_embeddings = [embeddings.make(a) for a in articles]
+ mid_embedding = sum(article_embeddings) / len(article_embeddings)
+
+ # debug:
+ for ae in article_embeddings:
+ logging.debug(ae[:5])
+ logging.debug(mid_embedding[:5])
+
+ total, result = await asyncio.gather(
redis_client.ft(config.INDEX_NAME).search(count_query),
redis_client.ft(config.INDEX_NAME).search(
query,
- query_params={
- "vec_param": embeddings.make(similarity_request.user_text).tobytes()
- }
- )
+ query_params={"vec_param": mid_embedding.tobytes()},
+ ),
)
# Get Paper records of those results
- return await papers_from_results(total.total, results)
+ return await papers_from_results(total.total, result)
diff --git a/backend/vecsim_app/categories.py b/backend/vecsim_app/categories.py
new file mode 100644
index 0000000..25dbe73
--- /dev/null
+++ b/backend/vecsim_app/categories.py
@@ -0,0 +1,155 @@
+CATEGORIES = {
+ "astro-ph": "Astrophysics",
+ "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
+ "astro-ph.EP": "Earth and Planetary Astrophysics",
+ "astro-ph.GA": "Astrophysics of Galaxies",
+ "astro-ph.HE": "High Energy Astrophysical Phenomena",
+ "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
+ "astro-ph.SR": "Solar and Stellar Astrophysics",
+ "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
+ "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
+ "cond-mat.mtrl-sci": "Materials Science",
+ "cond-mat.other": "Other Condensed Matter",
+ "cond-mat.quant-gas": "Quantum Gases",
+ "cond-mat.soft": "Soft Condensed Matter",
+ "cond-mat.stat-mech": "Statistical Mechanics",
+ "cond-mat.str-el": "Strongly Correlated Electrons",
+ "cond-mat.supr-con": "Superconductivity",
+ "cs.AI": "Artificial Intelligence",
+ "cs.AR": "Hardware Architecture",
+ "cs.CC": "Computational Complexity",
+ "cs.CE": "Computational Engineering, Finance, and Science",
+ "cs.CG": "Computational Geometry",
+ "cs.CL": "Computation and Language",
+ "cs.CR": "Cryptography and Security",
+ "cs.CV": "Computer Vision and Pattern Recognition",
+ "cs.CY": "Computers and Society",
+ "cs.DB": "Databases",
+ "cs.DC": "Distributed, Parallel, and Cluster Computing",
+ "cs.DL": "Digital Libraries",
+ "cs.DM": "Discrete Mathematics",
+ "cs.DS": "Data Structures and Algorithms",
+ "cs.ET": "Emerging Technologies",
+ "cs.FL": "Formal Languages and Automata Theory",
+ "cs.GL": "General Literature",
+ "cs.GR": "Graphics",
+ "cs.GT": "Computer Science and Game Theory",
+ "cs.HC": "Human-Computer Interaction",
+ "cs.IR": "Information Retrieval",
+ "cs.IT": "Information Theory",
+ "cs.LG": "Machine Learning",
+ "cs.LO": "Logic in Computer Science",
+ "cs.MA": "Multiagent Systems",
+ "cs.MM": "Multimedia",
+ "cs.MS": "Mathematical Software",
+ "cs.NA": "Numerical Analysis",
+ "cs.NE": "Neural and Evolutionary Computing",
+ "cs.NI": "Networking and Internet Architecture",
+ "cs.OH": "Other Computer Science",
+ "cs.OS": "Operating Systems",
+ "cs.PF": "Performance",
+ "cs.PL": "Programming Languages",
+ "cs.RO": "Robotics",
+ "cs.SC": "Symbolic Computation",
+ "cs.SD": "Sound",
+ "cs.SE": "Software Engineering",
+ "cs.SI": "Social and Information Networks",
+ "cs.SY": "Systems and Control",
+ "econ.EM": "Econometrics",
+ "eess.AS": "Audio and Speech Processing",
+ "eess.IV": "Image and Video Processing",
+ "eess.SP": "Signal Processing",
+ "gr-qc": "General Relativity and Quantum Cosmology",
+ "hep-ex": "High Energy Physics - Experiment",
+ "hep-lat": "High Energy Physics - Lattice",
+ "hep-ph": "High Energy Physics - Phenomenology",
+ "hep-th": "High Energy Physics - Theory",
+ "math.AC": "Commutative Algebra",
+ "math.AG": "Algebraic Geometry",
+ "math.AP": "Analysis of PDEs",
+ "math.AT": "Algebraic Topology",
+ "math.CA": "Classical Analysis and ODEs",
+ "math.CO": "Combinatorics",
+ "math.CT": "Category Theory",
+ "math.CV": "Complex Variables",
+ "math.DG": "Differential Geometry",
+ "math.DS": "Dynamical Systems",
+ "math.FA": "Functional Analysis",
+ "math.GM": "General Mathematics",
+ "math.GN": "General Topology",
+ "math.GR": "Group Theory",
+ "math.GT": "Geometric Topology",
+ "math.HO": "History and Overview",
+ "math.IT": "Information Theory",
+ "math.KT": "K-Theory and Homology",
+ "math.LO": "Logic",
+ "math.MG": "Metric Geometry",
+ "math.MP": "Mathematical Physics",
+ "math.NA": "Numerical Analysis",
+ "math.NT": "Number Theory",
+ "math.OA": "Operator Algebras",
+ "math.OC": "Optimization and Control",
+ "math.PR": "Probability",
+ "math.QA": "Quantum Algebra",
+ "math.RA": "Rings and Algebras",
+ "math.RT": "Representation Theory",
+ "math.SG": "Symplectic Geometry",
+ "math.SP": "Spectral Theory",
+ "math.ST": "Statistics Theory",
+ "math-ph": "Mathematical Physics",
+ "nlin.AO": "Adaptation and Self-Organizing Systems",
+ "nlin.CD": "Chaotic Dynamics",
+ "nlin.CG": "Cellular Automata and Lattice Gases",
+ "nlin.PS": "Pattern Formation and Solitons",
+ "nlin.SI": "Exactly Solvable and Integrable Systems",
+ "nucl-ex": "Nuclear Experiment",
+ "nucl-th": "Nuclear Theory",
+ "physics.acc-ph": "Accelerator Physics",
+ "physics.ao-ph": "Atmospheric and Oceanic Physics",
+ "physics.app-ph": "Applied Physics",
+ "physics.atm-clus": "Atomic and Molecular Clusters",
+ "physics.atom-ph": "Atomic Physics",
+ "physics.bio-ph": "Biological Physics",
+ "physics.chem-ph": "Chemical Physics",
+ "physics.class-ph": "Classical Physics",
+ "physics.comp-ph": "Computational Physics",
+ "physics.data-an": "Data Analysis, Statistics and Probability",
+ "physics.ed-ph": "Physics Education",
+ "physics.flu-dyn": "Fluid Dynamics",
+ "physics.gen-ph": "General Physics",
+ "physics.geo-ph": "Geophysics",
+ "physics.hist-ph": "History and Philosophy of Physics",
+ "physics.ins-det": "Instrumentation and Detectors",
+ "physics.med-ph": "Medical Physics",
+ "physics.optics": "Optics",
+ "physics.plasm-ph": "Plasma Physics",
+ "physics.pop-ph": "Popular Physics",
+ "physics.soc-ph": "Physics and Society",
+ "physics.space-ph": "Space Physics",
+ "q-bio.BM": "Biomolecules",
+ "q-bio.CB": "Cell Behavior",
+ "q-bio.GN": "Genomics",
+ "q-bio.MN": "Molecular Networks",
+ "q-bio.NC": "Neurons and Cognition",
+ "q-bio.OT": "Other Quantitative Biology",
+ "q-bio.PE": "Populations and Evolution",
+ "q-bio.QM": "Quantitative Methods",
+ "q-bio.SC": "Subcellular Processes",
+ "q-bio.TO": "Tissues and Organs",
+ "q-fin.CP": "Computational Finance",
+ "q-fin.EC": "Economics",
+ "q-fin.GN": "General Finance",
+ "q-fin.MF": "Mathematical Finance",
+ "q-fin.PM": "Portfolio Management",
+ "q-fin.PR": "Pricing of Securities",
+ "q-fin.RM": "Risk Management",
+ "q-fin.ST": "Statistical Finance",
+ "q-fin.TR": "Trading and Market Microstructure",
+ "quant-ph": "Quantum Physics",
+ "stat.AP": "Applications",
+ "stat.CO": "Computation",
+ "stat.ME": "Methodology",
+ "stat.ML": "Machine Learning",
+ "stat.OT": "Other Statistics",
+ "stat.TH": "Statistics Theory",
+}
diff --git a/backend/vecsim_app/config.py b/backend/vecsim_app/config.py
index 111d95d..3ae965d 100644
--- a/backend/vecsim_app/config.py
+++ b/backend/vecsim_app/config.py
@@ -10,6 +10,8 @@
REDIS_DB = os.environ.get("REDIS_DB", 0)
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "testing123")
REDIS_URL = f"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}"
+SERVER_HOST = os.environ.get("SERVER_HOST", "0.0.0.0")
+SERVER_PORT = os.environ.get("SERVER_PORT", 8888)
os.environ["REDIS_DATA_URL"] = REDIS_URL
os.environ["REDIS_OM_URL"] = REDIS_URL
API_V1_STR = "/api/v1"
diff --git a/backend/vecsim_app/data_utils.py b/backend/vecsim_app/data_utils.py
new file mode 100644
index 0000000..96d7f5f
--- /dev/null
+++ b/backend/vecsim_app/data_utils.py
@@ -0,0 +1,72 @@
+import json
+import re
+from typing import Iterator, Optional
+
+# Inclusive range of years accepted from a journal reference; regex matches
+# outside it are discarded.
+_MIN_YEAR = 1991
+_MAX_YEAR = 2022
+
+
+def _process(paper: str, year_pattern: str) -> dict:
+    """Parse one JSON-encoded paper record into the fields used downstream.
+
+    Args:
+        paper: JSON text of a single paper (``papers`` passes one file line).
+        year_pattern: Regex applied to ``journal-ref``; every match must be
+            convertible with ``int``.
+
+    Returns:
+        Dict with ``id``, ``title``, ``year``, ``authors``, ``categories``
+        (space-separated input re-joined with commas) and ``abstract``.
+        ``year`` is the smallest match within 1991..2022, else ``None``.
+    """
+    record = json.loads(paper)
+    year = None
+    # A missing, null or empty journal reference simply yields no year.
+    journal_ref = record.get("journal-ref")
+    if journal_ref:
+        matches = [int(m) for m in re.findall(year_pattern, journal_ref)]
+        plausible = [y for y in matches if _MIN_YEAR <= y <= _MAX_YEAR]
+        year = min(plausible) if plausible else None
+    return {
+        "id": record["id"],
+        "title": record["title"],
+        "year": year,
+        "authors": record["authors"],
+        "categories": ",".join(record["categories"].split(" ")),
+        "abstract": record["abstract"],
+    }
+
+
+def papers(
+    data_path: str,
+    year_cutoff: int,
+    year_pattern: str,
+    ml_category: Optional[str] = None,
+) -> Iterator[dict]:
+    """Yield processed papers from a JSON-lines file, filtered by year/category.
+
+    Args:
+        data_path: Path to a file holding one JSON paper record per line.
+        year_cutoff: Only papers whose extracted year is >= this are yielded.
+        year_pattern: Regex forwarded to ``_process`` for year extraction.
+        ml_category: Optional filter; when given, only papers whose
+            comma-joined category string contains it are yielded.
+
+    Yields:
+        The dicts produced by ``_process`` that pass both filters. Papers
+        without an extracted year are always skipped.
+    """
+    # Decode explicitly so the result does not depend on the platform locale.
+    with open(data_path, "r", encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue  # tolerate blank lines (e.g. a trailing newline)
+            paper = _process(line, year_pattern)
+            year = paper["year"]
+            if not year or year < year_cutoff:
+                continue
+            # NOTE: substring test on the joined string, so a prefix such as
+            # "astro-ph" also matches "astro-ph.CO".
+            if ml_category is None or ml_category in paper["categories"]:
+                yield paper
diff --git a/backend/vecsim_app/entrypoint.sh b/backend/vecsim_app/entrypoint.sh
old mode 100644
new mode 100755
index 7a81268..98f63b5
--- a/backend/vecsim_app/entrypoint.sh
+++ b/backend/vecsim_app/entrypoint.sh
@@ -1,4 +1,5 @@
#!/bin/sh
+set -eux
python load_data.py
diff --git a/backend/vecsim_app/load_data.py b/backend/vecsim_app/load_data.py
index cff5c15..532e0df 100644
--- a/backend/vecsim_app/load_data.py
+++ b/backend/vecsim_app/load_data.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python3
-import typing as t
import asyncio
-import numpy as np
import pickle
-import redis.asyncio as redis
+import typing as t
+import numpy as np
+import redis.asyncio as redis
from redis.commands.search.field import TagField
from vecsim_app import config
from vecsim_app.models import Paper
@@ -12,12 +12,17 @@
def read_paper_df() -> t.List:
- with open(config.DATA_LOCATION + "/arxiv_embeddings_10000.pkl", "rb") as f:
+ path = config.DATA_LOCATION + "/embeddings/arxiv_embeddings_400000.pkl"
+ print(f"Loading data from : {path}")
+ with open(path, "rb") as f:
df = pickle.load(f)
+ print(f"Loaded {len(df)} items")
return df
+
async def gather_with_concurrency(n, redis_conn, *papers):
semaphore = asyncio.Semaphore(n)
+
async def load_paper(paper):
async with semaphore:
vector = paper.pop('vector')
@@ -26,7 +31,7 @@ async def load_paper(paper):
paper['categories'] = paper['categories'].replace(",", "|")
p = Paper(**paper)
# save model TODO -- combine these two objects eventually
- await p.save()
+ await p.save(redis_conn)
# save vector data
key = "paper_vector:" + str(p.paper_id)
await redis_conn.hset(
@@ -37,10 +42,12 @@ async def load_paper(paper):
"categories": p.categories,
"year": p.year,
"vector": np.array(vector, dtype=np.float32).tobytes(),
- })
+ })
+
# gather with concurrency
await asyncio.gather(*[load_paper(p) for p in papers])
+
async def load_all_data():
# TODO use redis-om connection
redis_conn = redis.from_url(config.REDIS_URL)
diff --git a/backend/vecsim_app/main.py b/backend/vecsim_app/main.py
index 93b054c..deacc3f 100644
--- a/backend/vecsim_app/main.py
+++ b/backend/vecsim_app/main.py
@@ -1,19 +1,17 @@
import uvicorn
+import logging
from pathlib import Path
-from aredis_om import (
- get_redis_connection,
- Migrator
-)
+from aredis_om import Migrator, get_redis_connection
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from starlette.middleware.cors import CORSMiddleware
-
from vecsim_app import config
-from vecsim_app.models import Paper
from vecsim_app.api import routes
+from vecsim_app.models import Paper
from vecsim_app.spa import SinglePageApplication
+logging.basicConfig(level=logging.DEBUG)
app = FastAPI(
title=config.PROJECT_NAME,
@@ -21,14 +19,6 @@
openapi_url=config.OPENAPI_DOCS
)
-app.add_middleware(
- CORSMiddleware,
- allow_origins="*",
- allow_credentials=True,
- allow_methods=["*"],
- allow_headers=["*"]
-)
-
# Routers
app.include_router(
routes.paper_router,
@@ -55,21 +45,37 @@ async def startup():
app.mount(
path="/", app=SinglePageApplication(directory=gui_build_dir), name="SPA"
)
-
if __name__ == "__main__":
+ import logging
import os
+
+ logging.basicConfig(level=logging.INFO)
+
env = os.environ.get("DEPLOYMENT", "prod")
+ logging.info(f"Running in {env} mode")
server_attr = {
- "host": "0.0.0.0",
+ "host": config.SERVER_HOST,
"reload": True,
- "port": 8888,
- "workers": 1
+ "port": int(config.SERVER_PORT),
+ "workers": 1,
+ "log_level": "debug",
}
if env == "prod":
- server_attr.update({"reload": False,
- "workers": 2,
- "ssl_keyfile": "key.pem",
- "ssl_certfile": "full.pem"})
+ server_attr.update(
+ {
+ "reload": False,
+ "workers": 2,
+ "ssl_keyfile": "key.pem",
+ "ssl_certfile": "full.pem",
+ }
+ )
+ app.add_middleware(
+ CORSMiddleware,
+ allow_origins="*",
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"]
+ )
- uvicorn.run("main:app", **server_attr)
+ uvicorn.run("vecsim_app.main:app", **server_attr)
diff --git a/backend/vecsim_app/multilabel_classifier/__init__.py b/backend/vecsim_app/multilabel_classifier/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/vecsim_app/multilabel_classifier/inference.py b/backend/vecsim_app/multilabel_classifier/inference.py
new file mode 100644
index 0000000..82d7d0d
--- /dev/null
+++ b/backend/vecsim_app/multilabel_classifier/inference.py
@@ -0,0 +1,88 @@
+import pickle
+from typing import List
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, BertForSequenceClassification
+
+
+def predict_categories_on_single_text(text, model, tokenizer, mlb, proba_threshold=0.5):
+    """Predict the label set for one text with a multi-label classifier.
+
+    Args:
+        text: Input passed to ``tokenizer``.
+        model: Model returning an object with ``.logits`` and exposing ``.device``.
+        tokenizer: Callable producing a dict of tensors for ``model``.
+        mlb: Fitted binarizer providing ``inverse_transform``.
+        proba_threshold: Labels with sigmoid probability >= this are selected.
+
+    Returns:
+        ``(classes, probs)``: the labels decoded by ``mlb`` and the per-label
+        probability tensor (on CPU).
+    """
+    # Truncate over-long inputs to the tokenizer's maximum length; longer
+    # sequences may not be accepted by the model.
+    encoding = tokenizer(text, return_tensors="pt", truncation=True)
+    encoding = {k: v.to(model.device) for k, v in encoding.items()}
+
+    # Pure inference: no autograd graph is needed.
+    with torch.no_grad():
+        logits = model(**encoding).logits
+
+    # Sigmoid per label, then threshold into a 0/1 indicator vector.
+    probs = torch.sigmoid(logits.squeeze().cpu())
+    predictions = np.zeros(probs.shape)
+    predictions[(probs >= proba_threshold).numpy()] = 1
+
+    # inverse_transform works on a batch; we pass (and read back) one row.
+    classes = mlb.inverse_transform(predictions.reshape(1, -1))
+    classes = classes[0] if len(classes) > 0 else []
+
+    return classes, probs
+
+
+def load_models(
+    multilabel_model_path="categories", multilabel_binarizer_path="mlb.pkl"
+):
+    """Load the classifier, its tokenizer and the pickled label binarizer.
+
+    Args:
+        multilabel_model_path: Location given to ``from_pretrained`` for both
+            the model and the tokenizer.
+        multilabel_binarizer_path: Path of the pickled binarizer.
+
+    Returns:
+        ``(model, tokenizer, mlb)``.
+    """
+    model = BertForSequenceClassification.from_pretrained(
+        multilabel_model_path, problem_type="multi_label_classification"
+    )
+    tokenizer = AutoTokenizer.from_pretrained(multilabel_model_path)
+
+    # SECURITY: pickle executes arbitrary code on load; only point this at a
+    # binarizer file you produced yourself.
+    with open(multilabel_binarizer_path, "rb") as handle:
+        mlb = pickle.load(handle)
+
+    return model, tokenizer, mlb
+
+
+def predict_categories(queries: List[str], model, tokenizer, mlb, proba_threshold=0.45):
+    """Return the sorted union of labels predicted for each query.
+
+    Args:
+        queries: Texts to classify one at a time.
+        model, tokenizer, mlb: As returned by ``load_models``.
+        proba_threshold: Forwarded to ``predict_categories_on_single_text``.
+
+    Returns:
+        Sorted list of distinct labels across all queries.
+    """
+    categories = set()
+    for query in queries:
+        labels, _ = predict_categories_on_single_text(
+            query, model, tokenizer, mlb, proba_threshold=proba_threshold
+        )
+        categories.update(labels)
+
+    return sorted(categories)
diff --git a/backend/vecsim_app/schema/__init__.py b/backend/vecsim_app/schema/__init__.py
index 78bdb9a..866d5fe 100644
--- a/backend/vecsim_app/schema/__init__.py
+++ b/backend/vecsim_app/schema/__init__.py
@@ -1,4 +1,2 @@
-from .search import (
- SimilarityRequest,
- UserTextSimilarityRequest
-)
\ No newline at end of file
+from .predict_categories import CategoriesPredictionRequest
+from .search import SimilarityRequest, UserTextSimilarityRequest
diff --git a/backend/vecsim_app/schema/predict_categories.py b/backend/vecsim_app/schema/predict_categories.py
new file mode 100644
index 0000000..c72197c
--- /dev/null
+++ b/backend/vecsim_app/schema/predict_categories.py
@@ -0,0 +1,6 @@
+from pydantic import BaseModel
+
+
+class CategoriesPredictionRequest(BaseModel):  # request body of POST /predict-categories
+ articles: list  # items are handed to predict_categories as the texts to classify
+ proba_threshold: float = 0.35  # forwarded to predict_categories as proba_threshold
diff --git a/backend/vecsim_app/schema/search.py b/backend/vecsim_app/schema/search.py
index 8b7b9dc..e959e43 100644
--- a/backend/vecsim_app/schema/search.py
+++ b/backend/vecsim_app/schema/search.py
@@ -7,9 +7,11 @@ class SimilarityRequest(BaseModel):
years: list
number_of_results: int = 15
search_type: str = "KNN"
+ categories_operator: str = "AND"
+
class UserTextSimilarityRequest(BaseModel):
- user_text: str
+ articles: list
categories: list
years: list
number_of_results: int = 15
diff --git a/backend/vecsim_app/search_index.py b/backend/vecsim_app/search_index.py
index 423e53a..0c3f255 100644
--- a/backend/vecsim_app/search_index.py
+++ b/backend/vecsim_app/search_index.py
@@ -1,11 +1,12 @@
+import logging
import re
+from typing import Optional, Pattern
-from config import INDEX_NAME
from redis.asyncio import Redis
-from redis.commands.search.query import Query
-from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.field import VectorField
-from typing import Optional, Pattern
+from redis.commands.search.indexDefinition import IndexDefinition, IndexType
+from redis.commands.search.query import Query
+from vecsim_app.config import INDEX_NAME
class TokenEscaper:
@@ -23,12 +24,15 @@ def __init__(self, escape_chars_re: Optional[Pattern] = None):
self.escaped_chars_re = re.compile(self.DEFAULT_ESCAPED_CHARS)
def escape(self, value: str) -> str:
+ value = str(value)
+
def escape_symbol(match):
value = match.group(0)
return f"\\{value}"
return self.escaped_chars_re.sub(escape_symbol, value)
+
class SearchIndex:
"""
SearchIndex is used to wrap and capture all information
@@ -62,8 +66,7 @@ async def create_flat(
"DISTANCE_METRIC": distance_metric,
"INITIAL_CAP": number_of_vectors,
"BLOCK_SIZE": number_of_vectors
- }
- )
+ })
await self._create(
*fields,
vector_field,
@@ -95,14 +98,8 @@ async def create_hnsw(
"DIM": 768,
"DISTANCE_METRIC": distance_metric,
"INITIAL_CAP": number_of_vectors,
- }
- )
- await self._create(
- *fields,
- vector_field,
- redis_conn=redis_conn,
- prefix=prefix
- )
+ })
+ await self._create(*fields, vector_field, redis_conn=redis_conn, prefix=prefix)
async def _create(
self,
@@ -116,7 +113,9 @@ async def _create(
definition= IndexDefinition(prefix=[prefix], index_type=IndexType.HASH)
)
- def process_tags(self, categories: list, years: list) -> str:
+ def process_tags(
+ self, categories: list, years: list, categories_operator="AND"
+ ) -> str:
"""
Helper function to process tags data. TODO - factor this
out so it's agnostic to the name of the field.
@@ -128,33 +127,39 @@ def process_tags(self, categories: list, years: list) -> str:
Returns:
str: RediSearch tag query string.
"""
- tag = "("
+ tag = []
if years:
- years = "|".join([self.escaper.escape(year) for year in years])
- tag += f"(@year:{{{years}}})"
+ years = "{" + "|".join([self.escaper.escape(y) for y in years]) + "}"
+ tag.append(f"(@year:{years})")
+
if categories:
- categories = "|".join([self.escaper.escape(cat) for cat in categories])
- if tag:
- tag += f" (@categories:{{{categories}}})"
+ if categories_operator == "AND":
+ for c in categories:
+ cat = "{" + self.escaper.escape(c) + "}"
+ tag.append(f"(@categories:{cat})")
+ elif categories_operator == "OR":
+ cat = "{" + "|".join([self.escaper.escape(c) for c in categories]) + "}"
+ tag.append(f"(@categories:{cat})")
else:
- tag += f"(@categories:{{{categories}}})"
- tag += ")"
- # if no tags are selected
- if len(tag) < 3:
- tag = "*"
- return tag
+ raise ValueError(f"Unsupported categories_operator: {categories_operator}")
+
+ if tag:
+ tag = ["("] + tag + [")"]
+ else:
+ tag = ["*"]
+
+ return "".join(tag)
def vector_query(
self,
categories: list,
years: list,
- search_type: str="KNN",
- number_of_results: int=20
+ search_type: str='KNN',
+ number_of_results: int=20,
+ categories_operator: str='AND',
) -> Query:
"""
Create a RediSearch query to perform hybrid vector and tag based searches.
-
-
Args:
categories (list): List of categories.
years (list): List of years.
@@ -166,13 +171,16 @@ def vector_query(
"""
# Parse tags to create query
- tag_query = self.process_tags(categories, years)
- base_query = f'{tag_query}=>[{search_type} {number_of_results} @vector $vec_param AS vector_score]'
- return Query(base_query)\
- .sort_by("vector_score")\
- .paging(0, number_of_results)\
- .return_fields("paper_id", "paper_pk", "vector_score")\
+ tag_query = self.process_tags(categories, years, categories_operator)
+ base_query = f"{tag_query}=>[{search_type} {number_of_results} @vector $vec_param AS vector_score]"
+ logging.debug(f"base_query: {base_query}")
+ return (
+ Query(base_query)
+ .sort_by("vector_score")
+ .paging(0, number_of_results)
+ .return_fields("paper_id", "paper_pk", "vector_score")
.dialect(2)
+ )
def count_query(
self,
@@ -191,7 +199,5 @@ def count_query(
"""
# Parse tags to create query
tag_query = self.process_tags(categories, years)
- return Query(f'{tag_query}')\
- .no_content()\
- .dialect(2)
-
+ logging.debug(f"tag_query: {tag_query}")
+ return Query(f"{tag_query}").no_content().dialect(2)
diff --git a/data/README.md b/data/README.md
index 4248801..5e473aa 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,8 +1,9 @@
+
# Data!
Generate data before trying to run this application.
-### Three Notebooks
+### Notebooks
1. `arxiv-embeddings.ipynb` (app default)
- Uses local CPU and creates embeddings for ~10k machine learning papers.
@@ -17,3 +18,10 @@ Generate data before trying to run this application.
- Output: `arxiv_embeddings_300000pkl`.
+4. `multilabel-model.ipynb`
+ - Trains a multilabel classification model (a paper can belong to more than one category) by fine-tuning a transformers model (`bert-base-uncased`).
+ - Output: `mlb.pickle`, `checkpoint` folder with the NLP model weights.
+
+5. `multilabel-inference`
+ - Script showcasing the inference for the multilabel model.
+ - Output: N/A
\ No newline at end of file
diff --git a/data/categories.py b/data/categories.py
deleted file mode 100644
index d2ec10a..0000000
--- a/data/categories.py
+++ /dev/null
@@ -1,155 +0,0 @@
-_map = {
- 'astro-ph': 'Astrophysics',
- 'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
- 'astro-ph.EP': 'Earth and Planetary Astrophysics',
- 'astro-ph.GA': 'Astrophysics of Galaxies',
- 'astro-ph.HE': 'High Energy Astrophysical Phenomena',
- 'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
- 'astro-ph.SR': 'Solar and Stellar Astrophysics',
- 'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
- 'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
- 'cond-mat.mtrl-sci': 'Materials Science',
- 'cond-mat.other': 'Other Condensed Matter',
- 'cond-mat.quant-gas': 'Quantum Gases',
- 'cond-mat.soft': 'Soft Condensed Matter',
- 'cond-mat.stat-mech': 'Statistical Mechanics',
- 'cond-mat.str-el': 'Strongly Correlated Electrons',
- 'cond-mat.supr-con': 'Superconductivity',
- 'cs.AI': 'Artificial Intelligence',
- 'cs.AR': 'Hardware Architecture',
- 'cs.CC': 'Computational Complexity',
- 'cs.CE': 'Computational Engineering, Finance, and Science',
- 'cs.CG': 'Computational Geometry',
- 'cs.CL': 'Computation and Language',
- 'cs.CR': 'Cryptography and Security',
- 'cs.CV': 'Computer Vision and Pattern Recognition',
- 'cs.CY': 'Computers and Society',
- 'cs.DB': 'Databases',
- 'cs.DC': 'Distributed, Parallel, and Cluster Computing',
- 'cs.DL': 'Digital Libraries',
- 'cs.DM': 'Discrete Mathematics',
- 'cs.DS': 'Data Structures and Algorithms',
- 'cs.ET': 'Emerging Technologies',
- 'cs.FL': 'Formal Languages and Automata Theory',
- 'cs.GL': 'General Literature',
- 'cs.GR': 'Graphics',
- 'cs.GT': 'Computer Science and Game Theory',
- 'cs.HC': 'Human-Computer Interaction',
- 'cs.IR': 'Information Retrieval',
- 'cs.IT': 'Information Theory',
- 'cs.LG': 'Machine Learning',
- 'cs.LO': 'Logic in Computer Science',
- 'cs.MA': 'Multiagent Systems',
- 'cs.MM': 'Multimedia',
- 'cs.MS': 'Mathematical Software',
- 'cs.NA': 'Numerical Analysis',
- 'cs.NE': 'Neural and Evolutionary Computing',
- 'cs.NI': 'Networking and Internet Architecture',
- 'cs.OH': 'Other Computer Science',
- 'cs.OS': 'Operating Systems',
- 'cs.PF': 'Performance',
- 'cs.PL': 'Programming Languages',
- 'cs.RO': 'Robotics',
- 'cs.SC': 'Symbolic Computation',
- 'cs.SD': 'Sound',
- 'cs.SE': 'Software Engineering',
- 'cs.SI': 'Social and Information Networks',
- 'cs.SY': 'Systems and Control',
- 'econ.EM': 'Econometrics',
- 'eess.AS': 'Audio and Speech Processing',
- 'eess.IV': 'Image and Video Processing',
- 'eess.SP': 'Signal Processing',
- 'gr-qc': 'General Relativity and Quantum Cosmology',
- 'hep-ex': 'High Energy Physics - Experiment',
- 'hep-lat': 'High Energy Physics - Lattice',
- 'hep-ph': 'High Energy Physics - Phenomenology',
- 'hep-th': 'High Energy Physics - Theory',
- 'math.AC': 'Commutative Algebra',
- 'math.AG': 'Algebraic Geometry',
- 'math.AP': 'Analysis of PDEs',
- 'math.AT': 'Algebraic Topology',
- 'math.CA': 'Classical Analysis and ODEs',
- 'math.CO': 'Combinatorics',
- 'math.CT': 'Category Theory',
- 'math.CV': 'Complex Variables',
- 'math.DG': 'Differential Geometry',
- 'math.DS': 'Dynamical Systems',
- 'math.FA': 'Functional Analysis',
- 'math.GM': 'General Mathematics',
- 'math.GN': 'General Topology',
- 'math.GR': 'Group Theory',
- 'math.GT': 'Geometric Topology',
- 'math.HO': 'History and Overview',
- 'math.IT': 'Information Theory',
- 'math.KT': 'K-Theory and Homology',
- 'math.LO': 'Logic',
- 'math.MG': 'Metric Geometry',
- 'math.MP': 'Mathematical Physics',
- 'math.NA': 'Numerical Analysis',
- 'math.NT': 'Number Theory',
- 'math.OA': 'Operator Algebras',
- 'math.OC': 'Optimization and Control',
- 'math.PR': 'Probability',
- 'math.QA': 'Quantum Algebra',
- 'math.RA': 'Rings and Algebras',
- 'math.RT': 'Representation Theory',
- 'math.SG': 'Symplectic Geometry',
- 'math.SP': 'Spectral Theory',
- 'math.ST': 'Statistics Theory',
- 'math-ph': 'Mathematical Physics',
- 'nlin.AO': 'Adaptation and Self-Organizing Systems',
- 'nlin.CD': 'Chaotic Dynamics',
- 'nlin.CG': 'Cellular Automata and Lattice Gases',
- 'nlin.PS': 'Pattern Formation and Solitons',
- 'nlin.SI': 'Exactly Solvable and Integrable Systems',
- 'nucl-ex': 'Nuclear Experiment',
- 'nucl-th': 'Nuclear Theory',
- 'physics.acc-ph': 'Accelerator Physics',
- 'physics.ao-ph': 'Atmospheric and Oceanic Physics',
- 'physics.app-ph': 'Applied Physics',
- 'physics.atm-clus': 'Atomic and Molecular Clusters',
- 'physics.atom-ph': 'Atomic Physics',
- 'physics.bio-ph': 'Biological Physics',
- 'physics.chem-ph': 'Chemical Physics',
- 'physics.class-ph': 'Classical Physics',
- 'physics.comp-ph': 'Computational Physics',
- 'physics.data-an': 'Data Analysis, Statistics and Probability',
- 'physics.ed-ph': 'Physics Education',
- 'physics.flu-dyn': 'Fluid Dynamics',
- 'physics.gen-ph': 'General Physics',
- 'physics.geo-ph': 'Geophysics',
- 'physics.hist-ph': 'History and Philosophy of Physics',
- 'physics.ins-det': 'Instrumentation and Detectors',
- 'physics.med-ph': 'Medical Physics',
- 'physics.optics': 'Optics',
- 'physics.plasm-ph': 'Plasma Physics',
- 'physics.pop-ph': 'Popular Physics',
- 'physics.soc-ph': 'Physics and Society',
- 'physics.space-ph': 'Space Physics',
- 'q-bio.BM': 'Biomolecules',
- 'q-bio.CB': 'Cell Behavior',
- 'q-bio.GN': 'Genomics',
- 'q-bio.MN': 'Molecular Networks',
- 'q-bio.NC': 'Neurons and Cognition',
- 'q-bio.OT': 'Other Quantitative Biology',
- 'q-bio.PE': 'Populations and Evolution',
- 'q-bio.QM': 'Quantitative Methods',
- 'q-bio.SC': 'Subcellular Processes',
- 'q-bio.TO': 'Tissues and Organs',
- 'q-fin.CP': 'Computational Finance',
- 'q-fin.EC': 'Economics',
- 'q-fin.GN': 'General Finance',
- 'q-fin.MF': 'Mathematical Finance',
- 'q-fin.PM': 'Portfolio Management',
- 'q-fin.PR': 'Pricing of Securities',
- 'q-fin.RM': 'Risk Management',
- 'q-fin.ST': 'Statistical Finance',
- 'q-fin.TR': 'Trading and Market Microstructure',
- 'quant-ph': 'Quantum Physics',
- 'stat.AP': 'Applications',
- 'stat.CO': 'Computation',
- 'stat.ME': 'Methodology',
- 'stat.ML': 'Machine Learning',
- 'stat.OT': 'Other Statistics',
- 'stat.TH': 'Statistics Theory'
-}
\ No newline at end of file
diff --git a/data/embeddings/arxiv-embeddings-with-distilroberta.ipynb b/data/embeddings/arxiv-embeddings-with-distilroberta.ipynb
new file mode 100644
index 0000000..ef1bfea
--- /dev/null
+++ b/data/embeddings/arxiv-embeddings-with-distilroberta.ipynb
@@ -0,0 +1,716 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "fd6ed5af",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "51dbaff8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\n",
+ "You should consider upgrading via the '/home/jovyan/workspace/untitled1-vector-search/.venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
+ "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install -q -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "cc20c14a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "import re\n",
+ "import string\n",
+ "\n",
+ "from vecsim_app.embeddings import Embeddings\n",
+ "from vecsim_app.data_utils import papers\n",
+ "\n",
+ "\n",
+ "DATA_PATH = \"../arxiv-metadata-oai-snapshot.json\"\n",
+ "YEAR_CUTOFF = 2012\n",
+ "YEAR_PATTERN = r\"(19|20[0-9]{2})\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "04abead5-2567-47ed-ac51-abb10ca4b4c3",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "408773"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.DataFrame(papers(data_path=DATA_PATH, year_cutoff=YEAR_CUTOFF, year_pattern=YEAR_PATTERN))\n",
+ "len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "aee130cd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "169.84534547683685"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Avg length of the abstracts\n",
+ "# df.abstract.apply(lambda a: len(a.split())).mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "a1313d8d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " title \n",
+ " year \n",
+ " authors \n",
+ " categories \n",
+ " abstract \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0704.0304 \n",
+ " The World as Evolving Information \n",
+ " 2012 \n",
+ " Carlos Gershenson \n",
+ " cs.IT,cs.AI,math.IT,q-bio.PE \n",
+ " This paper discusses the benefits of describ... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0704.2744 \n",
+ " Nahm transform and parabolic minimal Laplace t... \n",
+ " 2012 \n",
+ " Szilard Szabo \n",
+ " math.AG \n",
+ " We prove that Nahm transform for integrable ... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0704.2768 \n",
+ " Heat Equations and the Weighted $\\bar\\partial$... \n",
+ " 2012 \n",
+ " Andrew Raich \n",
+ " math.AP,math.CV \n",
+ " The purpose of this article is to establish ... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id title year \\\n",
+ "0 0704.0304 The World as Evolving Information 2012 \n",
+ "1 0704.2744 Nahm transform and parabolic minimal Laplace t... 2012 \n",
+ "2 0704.2768 Heat Equations and the Weighted $\\bar\\partial$... 2012 \n",
+ "\n",
+ " authors categories \\\n",
+ "0 Carlos Gershenson cs.IT,cs.AI,math.IT,q-bio.PE \n",
+ "1 Szilard Szabo math.AG \n",
+ "2 Andrew Raich math.AP,math.CV \n",
+ "\n",
+ " abstract \n",
+ "0 This paper discusses the benefits of describ... \n",
+ "1 We prove that Nahm transform for integrable ... \n",
+ "2 The purpose of this article is to establish ... "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "f295cc33",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 Carlos Gershenson\n",
+ "1 Szilard Szabo\n",
+ "2 Andrew Raich\n",
+ "Name: authors_clean, dtype: object"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['authors_clean'] = df['authors'].apply(lambda a: ' '.join(re.findall(r'\\w\\w+', a)).strip())\n",
+ "df['authors_clean'][:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "6336293d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 the world as evolving information this paper ...\n",
+ "Name: text, dtype: object"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['text'] = df.apply(lambda r: Embeddings.clean_description(r['title'] + ' ' + r['abstract']), axis=1)\n",
+ "df['text'][:1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "ef747be9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0ffb374860f84975aaf2d23ceb24f5e1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/737 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9e014103c6694293b415e0b3cdb0ea95",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/190 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "913b798057c244bda8aa17e2ed0b3f4a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/10.3k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8fa9778d5d4d4d3da9a22c1a84686f70",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/653 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b1d5e1af1eaa49fa8a9c8af7de10210d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/116 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "54f8f28f4b3c4e7b84a138aaf3834518",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/15.7k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "905d38ab70f740a9ba6c54b4688aafc4",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/456k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "09a4b76c4aba446ba253cc216aff6d32",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/329M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ab0c35da5dd44338b9235b01e3a89669",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/53.0 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b00b3d9443e74d92abef2bc261a70376",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/239 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "46fc8939512842fd91240504fee5980f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/1.36M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c78d30917658427b8268587d11704674",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/333 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d6deee3f52da42e8a6b1fe61c251c67d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/13.1k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0336b5ca686b470f85a7a6eef77373b8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/798k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "64863682a0fd41f3b077f6e310de5ff4",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/349 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sentence_transformers import SentenceTransformer\n",
+ "#see models here: https://www.sbert.net/docs/pretrained_models.html\n",
+ "model_name = 'sentence-transformers/all-distilroberta-v1'\n",
+ "\n",
+ "model = SentenceTransformer(model_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "249ad360",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create embeddings from the title and abstract\n",
+ "emb = model.encode(df['text'].tolist())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "f7b4974a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Add embeddings to df\n",
+ "df = df.reset_index().drop('index', axis=1)\n",
+ "df['vector'] = emb.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "b71b9a2b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(408773, 9)"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "ab41fe6c-5620-489f-9004-cb0fe0b094cb",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import pickle\n",
+ "\n",
+ "# Export to file!\n",
+ "with open(f'arxiv_embeddings_10000.pkl', 'wb') as f:\n",
+ " data = pickle.dumps(df)\n",
+ " f.write(data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "4df5b7c8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+ "To disable this warning, you can either:\n",
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+ "total 106M\n",
+ "-rw-rw-r-- 1 jovyan jovyan 106M Nov 6 05:23 arxiv_embeddings_10000.pkl\n",
+ "-rw-rw-r-- 1 jovyan jovyan 16K Nov 6 05:23 arxiv-embeddings.ipynb\n",
+ "-rw-rw-r-- 1 jovyan jovyan 1.9K Nov 5 23:59 requirements.txt\n"
+ ]
+ }
+ ],
+ "source": [
+ "!ls -lh ."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "7969cbea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "embeddings = Embeddings()\n",
+ "\n",
+ "e1 = embeddings.make(\"text1\")\n",
+ "e2 = embeddings.make(\"text2\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "12ac859b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ 0.0134814 , -0.02945524, -0.0014616 ], dtype=float32)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "e1[:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "19efce03",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ 0.0119531 , -0.05998396, -0.0344477 ], dtype=float32)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "e2[:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "c22da504",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ 0.01271725, -0.0447196 , -0.01795465], dtype=float32)"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "((e1+e2)/2)[:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "54687037",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.013099324999999998"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(0.0134814+0.01271725)/2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "8429825c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.0447196"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(0.02945524+0.05998396)/2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "92b6abfa",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.9.12 64-bit",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/data/arxiv-embeddings.ipynb b/data/embeddings/arxiv-embeddings.ipynb
similarity index 100%
rename from data/arxiv-embeddings.ipynb
rename to data/embeddings/arxiv-embeddings.ipynb
diff --git a/data/multi-gpu-arxiv-embeddings.ipynb b/data/embeddings/multi-gpu-arxiv-embeddings.ipynb
similarity index 100%
rename from data/multi-gpu-arxiv-embeddings.ipynb
rename to data/embeddings/multi-gpu-arxiv-embeddings.ipynb
diff --git a/data/single-gpu-arxiv-embeddings.ipynb b/data/embeddings/single-gpu-arxiv-embeddings.ipynb
similarity index 100%
rename from data/single-gpu-arxiv-embeddings.ipynb
rename to data/embeddings/single-gpu-arxiv-embeddings.ipynb
diff --git a/data/multilabel_classifier/.gitignore b/data/multilabel_classifier/.gitignore
new file mode 100644
index 0000000..5f8aad6
--- /dev/null
+++ b/data/multilabel_classifier/.gitignore
@@ -0,0 +1,2 @@
+# checkpoints:
+paper-multilabel-finetuning/
diff --git a/data/multilabel_classifier/checkpoint/model_info.json b/data/multilabel_classifier/checkpoint/model_info.json
new file mode 100644
index 0000000..9021b32
--- /dev/null
+++ b/data/multilabel_classifier/checkpoint/model_info.json
@@ -0,0 +1,10 @@
+{
+ "eval_loss": 0.6576664447784424,
+ "eval_f1": 0.026587887740029542,
+ "eval_roc_auc": 0.5024737713970182,
+ "eval_accuracy": 0.0,
+ "eval_runtime": 0.6499,
+ "eval_samples_per_second": 18.465,
+ "eval_steps_per_second": 4.616,
+ "epoch": 1.0
+}
\ No newline at end of file
diff --git a/data/multilabel_classifier/multilabel-inference.ipynb b/data/multilabel_classifier/multilabel-inference.ipynb
new file mode 100644
index 0000000..14f7f40
--- /dev/null
+++ b/data/multilabel_classifier/multilabel-inference.ipynb
@@ -0,0 +1,526 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install -q -e ../../backend"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "from transformers import BertForSequenceClassification\n",
+ "from sklearn.preprocessing import MultiLabelBinarizer\n",
+ "from transformers import AutoTokenizer\n",
+ "import pickle\n",
+ " \n",
+ "from vecsim_app.categories import CATEGORIES\n",
+ "from vecsim_app.multilabel_classifier.inference import predict_categories, load_models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model, tokenizer, mlb = load_models('checkpoint/', 'checkpoint/mlb.pkl')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query = \"\"\"\n",
+ "The World as Evolving Information This paper discusses the benefits of describing the world as information,\\nespecially in the study of the evolution of life and cognition. Traditional\\nstudies encounter problems because it is difficult to describe life and\\ncognition in terms of matter and energy, since their laws are valid only at the\\nphysical scale. However, if matter and energy, as well as life and cognition,\\nare described in terms of information, evolution can be described consistently\\nas information becoming more complex.\\n The paper presents eight tentative laws of information, valid at multiple\\nscales, which are generalizations of Darwinian, cybernetic, thermodynamic,\\npsychological, philosophical, and complexity principles. These are further used\\nto discuss the notions of life, cognition and their evolution.\\n\n",
+ "\"\"\".lower()\n",
+ "\n",
+ "queries = [query]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['astro-ph',\n",
+ " 'astro-ph.CO',\n",
+ " 'astro-ph.EP',\n",
+ " 'astro-ph.GA',\n",
+ " 'astro-ph.HE',\n",
+ " 'astro-ph.IM',\n",
+ " 'astro-ph.SR',\n",
+ " 'cond-mat.dis-nn',\n",
+ " 'cond-mat.mes-hall',\n",
+ " 'cond-mat.mtrl-sci',\n",
+ " 'cond-mat.other',\n",
+ " 'cond-mat.quant-gas',\n",
+ " 'cond-mat.soft',\n",
+ " 'cond-mat.stat-mech',\n",
+ " 'cond-mat.str-el',\n",
+ " 'cond-mat.supr-con',\n",
+ " 'cs.AI',\n",
+ " 'cs.AR',\n",
+ " 'cs.CC',\n",
+ " 'cs.CE',\n",
+ " 'cs.CG',\n",
+ " 'cs.CL',\n",
+ " 'cs.CR',\n",
+ " 'cs.CV',\n",
+ " 'cs.CY',\n",
+ " 'cs.DB',\n",
+ " 'cs.DC',\n",
+ " 'cs.DL',\n",
+ " 'cs.DM',\n",
+ " 'cs.DS',\n",
+ " 'cs.ET',\n",
+ " 'cs.FL',\n",
+ " 'cs.GL',\n",
+ " 'cs.GR',\n",
+ " 'cs.GT',\n",
+ " 'cs.HC',\n",
+ " 'cs.IR',\n",
+ " 'cs.IT',\n",
+ " 'cs.LG',\n",
+ " 'cs.LO',\n",
+ " 'cs.MA',\n",
+ " 'cs.MM',\n",
+ " 'cs.MS',\n",
+ " 'cs.NA',\n",
+ " 'cs.NE',\n",
+ " 'cs.NI',\n",
+ " 'cs.OH',\n",
+ " 'cs.OS',\n",
+ " 'cs.PF',\n",
+ " 'cs.PL',\n",
+ " 'cs.RO',\n",
+ " 'cs.SC',\n",
+ " 'cs.SD',\n",
+ " 'cs.SE',\n",
+ " 'cs.SI',\n",
+ " 'cs.SY',\n",
+ " 'econ.EM',\n",
+ " 'eess.AS',\n",
+ " 'eess.IV',\n",
+ " 'eess.SP',\n",
+ " 'gr-qc',\n",
+ " 'hep-ex',\n",
+ " 'hep-lat',\n",
+ " 'hep-ph',\n",
+ " 'hep-th',\n",
+ " 'math-ph',\n",
+ " 'math.AC',\n",
+ " 'math.AG',\n",
+ " 'math.AP',\n",
+ " 'math.AT',\n",
+ " 'math.CA',\n",
+ " 'math.CO',\n",
+ " 'math.CT',\n",
+ " 'math.CV',\n",
+ " 'math.DG',\n",
+ " 'math.DS',\n",
+ " 'math.FA',\n",
+ " 'math.GM',\n",
+ " 'math.GN',\n",
+ " 'math.GR',\n",
+ " 'math.GT',\n",
+ " 'math.HO',\n",
+ " 'math.IT',\n",
+ " 'math.KT',\n",
+ " 'math.LO',\n",
+ " 'math.MG',\n",
+ " 'math.NA',\n",
+ " 'math.NT',\n",
+ " 'math.OA',\n",
+ " 'math.OC',\n",
+ " 'math.PR',\n",
+ " 'math.QA',\n",
+ " 'math.RA',\n",
+ " 'math.RT',\n",
+ " 'math.SG',\n",
+ " 'math.SP',\n",
+ " 'math.ST',\n",
+ " 'nlin.AO',\n",
+ " 'nlin.CD',\n",
+ " 'nlin.CG',\n",
+ " 'nlin.PS',\n",
+ " 'nlin.SI',\n",
+ " 'nucl-ex',\n",
+ " 'nucl-th',\n",
+ " 'physics.acc-ph',\n",
+ " 'physics.ao-ph',\n",
+ " 'physics.app-ph',\n",
+ " 'physics.atm-clus',\n",
+ " 'physics.atom-ph',\n",
+ " 'physics.bio-ph',\n",
+ " 'physics.chem-ph',\n",
+ " 'physics.class-ph',\n",
+ " 'physics.comp-ph',\n",
+ " 'physics.data-an',\n",
+ " 'physics.ed-ph',\n",
+ " 'physics.flu-dyn',\n",
+ " 'physics.gen-ph',\n",
+ " 'physics.geo-ph',\n",
+ " 'physics.hist-ph',\n",
+ " 'physics.ins-det',\n",
+ " 'physics.med-ph',\n",
+ " 'physics.optics',\n",
+ " 'physics.plasm-ph',\n",
+ " 'physics.pop-ph',\n",
+ " 'physics.soc-ph',\n",
+ " 'physics.space-ph',\n",
+ " 'q-bio.BM',\n",
+ " 'q-bio.CB',\n",
+ " 'q-bio.GN',\n",
+ " 'q-bio.MN',\n",
+ " 'q-bio.NC',\n",
+ " 'q-bio.OT',\n",
+ " 'q-bio.PE',\n",
+ " 'q-bio.QM',\n",
+ " 'q-bio.SC',\n",
+ " 'q-fin.CP',\n",
+ " 'q-fin.EC',\n",
+ " 'q-fin.GN',\n",
+ " 'q-fin.MF',\n",
+ " 'q-fin.PM',\n",
+ " 'q-fin.PR',\n",
+ " 'q-fin.RM',\n",
+ " 'q-fin.ST',\n",
+ " 'q-fin.TR',\n",
+ " 'quant-ph',\n",
+ " 'stat.AP',\n",
+ " 'stat.CO',\n",
+ " 'stat.ME',\n",
+ " 'stat.ML',\n",
+ " 'stat.OT',\n",
+ " 'stat.TH']"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "categories = predict_categories(queries, model, tokenizer, mlb, proba_threshold=0.3)\n",
+ "categories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pairs = [['The World as Evolving Information This paper discusses the benefits of describing the world as information,\\nespecially in the study of the evolution of life and cognition. Traditional\\nstudies encounter problems because it is difficult to describe life and\\ncognition in terms of matter and energy, since their laws are valid only at the\\nphysical scale. However, if matter and energy, as well as life and cognition,\\nare described in terms of information, evolution can be described consistently\\nas information becoming more complex.\\n The paper presents eight tentative laws of information, valid at multiple\\nscales, which are generalizations of Darwinian, cybernetic, thermodynamic,\\npsychological, philosophical, and complexity principles. These are further used\\nto discuss the notions of life, cognition and their evolution.\\n',\n",
+ " 'cs.IT,cs.AI,math.IT,q-bio.PE'],\n",
+ " ['Nahm transform and parabolic minimal Laplace transform We prove that Nahm transform for integrable connections with a finite number\\nof regular singularities and an irregular singularity of rank 1 on the Riemann\\nsphere is equivalent -- up to considering integrable connections as holonomic\\n$\\\\D$-modules -- to minimal Laplace transform. We assume semi-simplicity and\\nresonance-freeness conditions, and we work in the framework of objects with a\\nparabolic structure. In particular, we describe the definition of the parabolic\\nversion of Laplace transform due to C. Sabbah. The proof of the main result\\nrelies on the study of a twisted de Rham complex.\\n',\n",
+ " 'math.AG'],\n",
+ " [\"Heat Equations and the Weighted $\\\\bar\\\\partial$-Problem The purpose of this article is to establish regularity and pointwise upper\\nbounds for the (relative) fundamental solution of the heat equation associated\\nto the weighted dbar-operator in $L^2(C^n)$ for a certain class of weights. The\\nweights depend on a parameter, and we find pointwise bounds for heat kernel, as\\nwell as its derivatives in time, space, and the parameter. We also prove\\ncancellation conditions for the heat semigroup. We reduce the $n$-dimensional\\ncase to the one-dimensional case, and the estimates in one-dimensional case are\\nachieved by Duhamel's principle and commutator properties of the operators. As\\nan application, we recover estimates of heat kernels on polynomial models in\\n$C^2$.\\n\",\n",
+ " 'math.AP,math.CV'],\n",
+ " ['Characterization of polynomials In 1954 it was proved if f is infinitely differentiable in the interval I and\\nsome derivative (of order depending on x) vanishes at each x, then f is a\\npolynomial. Later it was generalized for multi-variable case. In this paper we\\ngive an extension for distributions.\\n',\n",
+ " 'math.AP,math.FA'],\n",
+ " ['Groups of diffeomorphisms and geometric loops of manifolds over\\n ultra-normed fields The article is devoted to the investigation of groups of diffeomorphisms and\\nloops of manifolds over ultra-metric fields of zero and positive\\ncharacteristics. Different types of topologies are considered on groups of\\nloops and diffeomorphisms relative to which they are generalized Lie groups or\\ntopological groups. Among such topologies pairwise incomparable are found as\\nwell. Topological perfectness of the diffeomorphism group relative to certain\\ntopologies is studied. There are proved theorems about projective limit\\ndecompositions of these groups and their compactifications for compact\\nmanifolds. Moreover, an existence of one-parameter local subgroups of\\ndiffeomorphism groups is investigated.\\n',\n",
+ " 'math.GR,math.FA'],\n",
+ " ['Suppression of growth by multiplicative white noise in a parametric\\n resonant system The author studied the growth of the amplitude in a Mathieu-like equation\\nwith multiplicative white noise. The approximate value of the exponent at the\\nextremum on parametric resonance regions was obtained theoretically by\\nintroducing the width of time interval, and the exponents were calculated\\nnumerically by solving the stochastic differential equations by a symplectic\\nnumerical method. The Mathieu-like equation contains a parameter $\\\\alpha$ that\\nis determined by the intensity of noise and the strength of the coupling\\nbetween the variable and the noise. The value of $\\\\alpha$ was restricted not to\\nbe negative without loss of generality. It was shown that the exponent\\ndecreases with $\\\\alpha$, reaches a minimum and increases after that. It was\\nalso found that the exponent as a function of $\\\\alpha$ has only one minimum at\\n$\\\\alpha \\\\neq 0$ on parametric resonance regions of $\\\\alpha = 0$. This minimum\\nvalue is obtained theoretically and numerically. The existence of the minimum\\nat $\\\\alpha \\\\neq 0$ indicates the suppression of the growth by multiplicative\\nwhite noise.\\n',\n",
+ " 'cond-mat.stat-mech,cond-mat.other'],\n",
+ " ['Nonholonomic Algebroids, Finsler Geometry, and Lagrange-Hamilton Spaces We elaborate an unified geometric approach to classical mechanics,\\nRiemann-Finsler spaces and gravity theories on Lie algebroids provided with\\nnonlinear connection (N-connection) structure. There are investigated the\\nconditions when the fundamental geometric objects like the anchor, metric and\\nlinear connection, almost sympletic and related almost complex structures may\\nbe canonically defined by a N-connection induced from a regular Lagrangian (or\\nHamiltonian), in mechanical models, or by generic off-diagonal metric terms and\\nnonholonomic frames, in gravity theories. Such geometric constructions are\\nmodelled on nonholonomic manifolds provided with nonintegrable distributions\\nand related chains of exact sequences of submanifolds defining N-connections.\\nWe investigate the main properties of the Lagrange, Hamilton, Finsler-Riemann\\nand Einstein-Cartan algebroids and construct and analyze exact solutions\\ndescribing such objects.\\n',\n",
+ " 'math-ph,gr-qc,hep-th,math.DG,math.MP'],\n",
+ " ['Stability of the periodic Toda lattice under short range perturbations We consider the stability of the periodic Toda lattice (and slightly more\\ngenerally of the algebro-geometric finite-gap lattice) under a short range\\nperturbation. We prove that the perturbed lattice asymptotically approaches a\\nmodulated lattice.\\n More precisely, let $g$ be the genus of the hyperelliptic curve associated\\nwith the unperturbed solution. We show that, apart from the phenomenon of the\\nsolitons travelling on the quasi-periodic background, the $n/t$-pane contains\\n$g+2$ areas where the perturbed solution is close to a finite-gap solution in\\nthe same isospectral torus. In between there are $g+1$ regions where the\\nperturbed solution is asymptotically close to a modulated lattice which\\nundergoes a continuous phase transition (in the Jacobian variety) and which\\ninterpolates between these isospectral solutions. In the special case of the\\nfree lattice ($g=0$) the isospectral torus consists of just one point and we\\nrecover the known result.\\n Both the solutions in the isospectral torus and the phase transition are\\nexplicitly characterized in terms of Abelian integrals on the underlying\\nhyperelliptic curve.\\n Our method relies on the equivalence of the inverse spectral problem to a\\nmatrix Riemann--Hilbert problem defined on the hyperelliptic curve and\\ngeneralizes the so-called nonlinear stationary phase/steepest descent method\\nfor Riemann--Hilbert problem deformations to Riemann surfaces.\\n',\n",
+ " 'nlin.SI,math-ph,math.MP'],\n",
+ " ['Non Supersymmetric Metastable Vacua in N=2 SYM Softly Broken to N=1 We find non-supersymmetric metastable vacua in four dimensional N=2 gauge\\ntheories softly broken to N=1 by a superpotential term. First we study the\\nsimplest case, namely the SU(2) gauge theory without flavors. We study the\\nspectrum and lifetime of the metastable vacuum and possible embeddings of the\\nmodel in UV complete theories. Then we consider larger gauge group theories\\nwith flavors. We show that when we softly break them to N=1, the potential\\ninduced on specific submanifolds of their moduli space is identical to the\\npotential in lower rank gauge theories. Then we show that the potential\\nincreases when we move away from this submanifold, allowing us to construct\\nmetastable vacua on them in the theories that can be reduced to the SU(2) case.\\n',\n",
+ " 'hep-th'],\n",
+ " ['Strong Stein neighborhood bases Let D be a smooth bounded pseudoconvex domain in C^n. We give several\\ncharacterizations for the closure of D to have a strong Stein neighborhood\\nbasis in the sense that D has a defining function r such that {z\\\\in C^n:r(z)0. We also show that this condition is\\ninvariant under proper holomorphic maps that extend smoothly to the boundary.\\n',\n",
+ " 'math.CV']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 
'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Artificial Intelligence', 'Information Theory', 'Information Theory', 'Populations and Evolution']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 
'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Algebraic Geometry']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Materials Science', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 
'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Mathematical Physics', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Spectral Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Analysis of PDEs', 'Complex Variables']\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 'Differential 
Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Analysis of PDEs', 'Functional Analysis']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 
'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Functional Analysis', 'Group Theory']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 
'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Other Condensed Matter', 'Statistical Mechanics']\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "actual ['Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General 
Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['General Relativity and Quantum Cosmology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Differential Geometry', 'Mathematical Physics']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Materials Science', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 
'Category Theory', 'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Mathematical Physics', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Quantum Algebra', 'Rings and Algebras', 'Representation Theory', 'Symplectic Geometry', 'Spectral Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Mathematical Physics', 'Mathematical Physics', 'Exactly Solvable and Integrable Systems']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 'Complex Variables', 
'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Mathematical Physics', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['High Energy Physics - Theory']\n",
+ "\n",
+ "actual ['Astrophysics', 'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Astrophysics of Galaxies', 'High Energy Astrophysical Phenomena', 'Instrumentation and Methods for Astrophysics', 'Solar and Stellar Astrophysics', 'Disordered Systems and Neural Networks', 'Mesoscale and Nanoscale Physics', 'Other Condensed Matter', 'Quantum Gases', 'Soft Condensed Matter', 'Statistical Mechanics', 'Strongly Correlated Electrons', 'Superconductivity', 'Artificial Intelligence', 'Hardware Architecture', 'Computational Complexity', 'Computational Engineering, Finance, and Science', 'Computational Geometry', 'Computation and Language', 'Cryptography and Security', 'Computer Vision and Pattern Recognition', 'Computers and Society', 'Databases', 'Distributed, Parallel, and Cluster Computing', 'Digital Libraries', 'Discrete Mathematics', 'Data Structures and Algorithms', 'Emerging Technologies', 'Formal Languages and Automata Theory', 'General Literature', 'Graphics', 'Computer Science and Game Theory', 'Human-Computer Interaction', 'Information Retrieval', 'Information Theory', 'Machine Learning', 'Logic in Computer Science', 'Multiagent Systems', 'Multimedia', 'Mathematical Software', 'Numerical Analysis', 'Neural and Evolutionary Computing', 'Networking and Internet Architecture', 'Other Computer Science', 'Operating Systems', 'Performance', 'Programming Languages', 'Robotics', 'Symbolic Computation', 'Sound', 'Software Engineering', 'Social and Information Networks', 'Systems and Control', 'Econometrics', 'Audio and Speech Processing', 'Image and Video Processing', 'Signal Processing', 'General Relativity and Quantum Cosmology', 'High Energy Physics - Experiment', 'High Energy Physics - Lattice', 'High Energy Physics - Phenomenology', 'High Energy Physics - Theory', 'Mathematical Physics', 'Commutative Algebra', 'Algebraic Geometry', 'Analysis of PDEs', 'Algebraic Topology', 'Classical Analysis and ODEs', 'Combinatorics', 'Category Theory', 
'Complex Variables', 'Differential Geometry', 'Dynamical Systems', 'Functional Analysis', 'General Mathematics', 'General Topology', 'Group Theory', 'Geometric Topology', 'History and Overview', 'Information Theory', 'K-Theory and Homology', 'Logic', 'Metric Geometry', 'Mathematical Physics', 'Numerical Analysis', 'Number Theory', 'Operator Algebras', 'Optimization and Control', 'Probability', 'Quantum Algebra', 'Rings and Algebras', 'Symplectic Geometry', 'Spectral Theory', 'Statistics Theory', 'Adaptation and Self-Organizing Systems', 'Chaotic Dynamics', 'Cellular Automata and Lattice Gases', 'Pattern Formation and Solitons', 'Exactly Solvable and Integrable Systems', 'Nuclear Experiment', 'Nuclear Theory', 'Accelerator Physics', 'Atmospheric and Oceanic Physics', 'Applied Physics', 'Atomic and Molecular Clusters', 'Atomic Physics', 'Biological Physics', 'Chemical Physics', 'Classical Physics', 'Computational Physics', 'Data Analysis, Statistics and Probability', 'Physics Education', 'Fluid Dynamics', 'General Physics', 'Geophysics', 'History and Philosophy of Physics', 'Instrumentation and Detectors', 'Medical Physics', 'Optics', 'Plasma Physics', 'Popular Physics', 'Physics and Society', 'Space Physics', 'Biomolecules', 'Cell Behavior', 'Genomics', 'Molecular Networks', 'Neurons and Cognition', 'Other Quantitative Biology', 'Populations and Evolution', 'Quantitative Methods', 'Subcellular Processes', 'Computational Finance', 'Economics', 'General Finance', 'Mathematical Finance', 'Portfolio Management', 'Pricing of Securities', 'Risk Management', 'Statistical Finance', 'Trading and Market Microstructure', 'Quantum Physics', 'Applications', 'Computation', 'Methodology', 'Machine Learning', 'Other Statistics', 'Statistics Theory']\n",
+ "expected ['Complex Variables']\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "for text, expected_categories in pairs:\n",
+ " actual = predict_categories([text], model, tokenizer, mlb, proba_threshold=0.35)\n",
+ " print('actual', [CATEGORIES[c] for c in actual])\n",
+ " print('expected', [CATEGORIES[c] for c in sorted(expected_categories.split(','))])\n",
+ " print()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['astro-ph',\n",
+ " 'astro-ph.CO',\n",
+ " 'astro-ph.EP',\n",
+ " 'astro-ph.GA',\n",
+ " 'astro-ph.HE',\n",
+ " 'astro-ph.IM',\n",
+ " 'astro-ph.SR',\n",
+ " 'cond-mat.dis-nn',\n",
+ " 'cond-mat.mes-hall',\n",
+ " 'cond-mat.other',\n",
+ " 'cond-mat.quant-gas',\n",
+ " 'cond-mat.soft',\n",
+ " 'cond-mat.stat-mech',\n",
+ " 'cond-mat.str-el',\n",
+ " 'cond-mat.supr-con',\n",
+ " 'cs.AI',\n",
+ " 'cs.AR',\n",
+ " 'cs.CC',\n",
+ " 'cs.CE',\n",
+ " 'cs.CG',\n",
+ " 'cs.CL',\n",
+ " 'cs.CR',\n",
+ " 'cs.CV',\n",
+ " 'cs.CY',\n",
+ " 'cs.DB',\n",
+ " 'cs.DC',\n",
+ " 'cs.DL',\n",
+ " 'cs.DM',\n",
+ " 'cs.DS',\n",
+ " 'cs.ET',\n",
+ " 'cs.FL',\n",
+ " 'cs.GL',\n",
+ " 'cs.GR',\n",
+ " 'cs.GT',\n",
+ " 'cs.HC',\n",
+ " 'cs.IR',\n",
+ " 'cs.IT',\n",
+ " 'cs.LG',\n",
+ " 'cs.LO',\n",
+ " 'cs.MA',\n",
+ " 'cs.MM',\n",
+ " 'cs.MS',\n",
+ " 'cs.NA',\n",
+ " 'cs.NE',\n",
+ " 'cs.NI',\n",
+ " 'cs.OH',\n",
+ " 'cs.OS',\n",
+ " 'cs.PF',\n",
+ " 'cs.PL',\n",
+ " 'cs.RO',\n",
+ " 'cs.SC',\n",
+ " 'cs.SD',\n",
+ " 'cs.SE',\n",
+ " 'cs.SI',\n",
+ " 'cs.SY',\n",
+ " 'econ.EM',\n",
+ " 'eess.AS',\n",
+ " 'eess.IV',\n",
+ " 'eess.SP',\n",
+ " 'gr-qc',\n",
+ " 'hep-ex',\n",
+ " 'hep-lat',\n",
+ " 'hep-ph',\n",
+ " 'hep-th',\n",
+ " 'math-ph',\n",
+ " 'math.AC',\n",
+ " 'math.AG',\n",
+ " 'math.AP',\n",
+ " 'math.AT',\n",
+ " 'math.CA',\n",
+ " 'math.CO',\n",
+ " 'math.CT',\n",
+ " 'math.CV',\n",
+ " 'math.DG',\n",
+ " 'math.DS',\n",
+ " 'math.FA',\n",
+ " 'math.GM',\n",
+ " 'math.GN',\n",
+ " 'math.GR',\n",
+ " 'math.GT',\n",
+ " 'math.HO',\n",
+ " 'math.IT',\n",
+ " 'math.KT',\n",
+ " 'math.LO',\n",
+ " 'math.MG',\n",
+ " 'math.MP',\n",
+ " 'math.NA',\n",
+ " 'math.NT',\n",
+ " 'math.OA',\n",
+ " 'math.OC',\n",
+ " 'math.PR',\n",
+ " 'math.QA',\n",
+ " 'math.RA',\n",
+ " 'math.SG',\n",
+ " 'math.SP',\n",
+ " 'math.ST',\n",
+ " 'nlin.AO',\n",
+ " 'nlin.CD',\n",
+ " 'nlin.CG',\n",
+ " 'nlin.PS',\n",
+ " 'nlin.SI',\n",
+ " 'nucl-ex',\n",
+ " 'nucl-th',\n",
+ " 'physics.acc-ph',\n",
+ " 'physics.ao-ph',\n",
+ " 'physics.app-ph',\n",
+ " 'physics.atm-clus',\n",
+ " 'physics.atom-ph',\n",
+ " 'physics.bio-ph',\n",
+ " 'physics.chem-ph',\n",
+ " 'physics.class-ph',\n",
+ " 'physics.comp-ph',\n",
+ " 'physics.data-an',\n",
+ " 'physics.ed-ph',\n",
+ " 'physics.flu-dyn',\n",
+ " 'physics.gen-ph',\n",
+ " 'physics.geo-ph',\n",
+ " 'physics.hist-ph',\n",
+ " 'physics.ins-det',\n",
+ " 'physics.med-ph',\n",
+ " 'physics.optics',\n",
+ " 'physics.plasm-ph',\n",
+ " 'physics.pop-ph',\n",
+ " 'physics.soc-ph',\n",
+ " 'physics.space-ph',\n",
+ " 'q-bio.BM',\n",
+ " 'q-bio.CB',\n",
+ " 'q-bio.GN',\n",
+ " 'q-bio.MN',\n",
+ " 'q-bio.NC',\n",
+ " 'q-bio.OT',\n",
+ " 'q-bio.PE',\n",
+ " 'q-bio.QM',\n",
+ " 'q-bio.SC',\n",
+ " 'q-fin.CP',\n",
+ " 'q-fin.EC',\n",
+ " 'q-fin.GN',\n",
+ " 'q-fin.MF',\n",
+ " 'q-fin.PM',\n",
+ " 'q-fin.PR',\n",
+ " 'q-fin.RM',\n",
+ " 'q-fin.ST',\n",
+ " 'q-fin.TR',\n",
+ " 'quant-ph',\n",
+ " 'stat.AP',\n",
+ " 'stat.CO',\n",
+ " 'stat.ME',\n",
+ " 'stat.ML',\n",
+ " 'stat.OT',\n",
+ " 'stat.TH']"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "actual"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.16"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/data/multilabel_classifier/multilabel-model.ipynb b/data/multilabel_classifier/multilabel-model.ipynb
new file mode 100644
index 0000000..0a257ef
--- /dev/null
+++ b/data/multilabel_classifier/multilabel-model.ipynb
@@ -0,0 +1,1264 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install -q -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "import re\n",
+ "import string\n",
+ "import pickle\n",
+ "\n",
+ "from transformers import BertForSequenceClassification\n",
+ "from sklearn.preprocessing import MultiLabelBinarizer\n",
+ "from transformers import AutoTokenizer\n",
+ "from sklearn.metrics import f1_score, roc_auc_score, accuracy_score\n",
+ "from transformers import EvalPrediction\n",
+ "import torch\n",
+ "from transformers import TrainingArguments, Trainer\n",
+ "from datasets import Dataset\n",
+ "import numpy as np\n",
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "#os.chdir('../..')\n",
+ "\n",
+ "from vecsim_app.categories import CATEGORIES\n",
+ "from vecsim_app.data_utils import papers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Define the parameters for fetching the papers dataset:\n",
+ "\n",
+ "- Dataset Path\n",
+ "- Year cutoff: Year cut off for the papers.\n",
+ "- Year pattern: regular expression used to match the years of the papers to fetch\n",
+ "- Max Sample size: maximum sample size (if you just want to try out the notebook - if it's too low the model won't perform well)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DATA_PATH = \"./arxiv-metadata-oai-snapshot.json\"\n",
+ "YEAR_CUTOFF = 2010\n",
+ "YEAR_PATTERN = r\"(19|20[0-9]{2})\"\n",
+ "MAX_SAMPLE_SIZE = 20000\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.DataFrame(papers(data_path=DATA_PATH, year_cutoff=YEAR_CUTOFF, year_pattern=YEAR_PATTERN))\n",
+ "len(df)\n",
+ "\n",
+ "# Take a sample for computing reasons\n",
+ "df = df.sample(MAX_SAMPLE_SIZE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " title \n",
+ " year \n",
+ " authors \n",
+ " categories \n",
+ " abstract \n",
+ " text \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 210240 \n",
+ " 1512.07410 \n",
+ " Fragmentation of long-lived hydrocarbons after... \n",
+ " 2016 \n",
+ " Seyedreza Larimian, Sonia Erattupuzha, Erik L\\... \n",
+ " physics.chem-ph,physics.atm-clus,physics.optics \n",
+ " We experimentally and theoretically investig... \n",
+ " Fragmentation of long-lived hydrocarbons after... \n",
+ " \n",
+ " \n",
+ " 148715 \n",
+ " 1404.1520 \n",
+ " Single spin stochastic optical reconstruction ... \n",
+ " 2014 \n",
+ " Matthias Pfender, Nabeel Aslam, Gerald Waldher... \n",
+ " quant-ph,physics.optics \n",
+ " We experimentally demonstrate precision addr... \n",
+ " Single spin stochastic optical reconstruction ... \n",
+ " \n",
+ " \n",
+ " 219516 \n",
+ " 1603.07790 \n",
+ " Weighted Pushdown Systems with Indexed Weight ... \n",
+ " 2016 \n",
+ " Yasuhiko Minamide \n",
+ " cs.FL,cs.PL \n",
+ " The reachability analysis of weighted pushdo... \n",
+ " Weighted Pushdown Systems with Indexed Weight ... \n",
+ " \n",
+ " \n",
+ " 370814 \n",
+ " 1911.02005 \n",
+ " Simultaneous spectral estimation of dephasing ... \n",
+ " 2020 \n",
+ " Virginia Frey, Leigh M. Norris, Lorenza Viola ... \n",
+ " quant-ph \n",
+ " The fragility of quantum systems makes them ... \n",
+ " Simultaneous spectral estimation of dephasing ... \n",
+ " \n",
+ " \n",
+ " 228464 \n",
+ " 1606.06192 \n",
+ " A Novel Quasi-One-Dimensional Topological Insu... \n",
+ " 2016 \n",
+ " Gabriel Aut\\`es, Anna Isaeva, Luca Moreschini,... \n",
+ " cond-mat.mtrl-sci,cond-mat.mes-hall \n",
+ " Recent progress in the field of topological ... \n",
+ " A Novel Quasi-One-Dimensional Topological Insu... \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 316199 \n",
+ " 1808.06472 \n",
+ " Dark Matter Sommerfeld-enhanced annihilation a... \n",
+ " 2018 \n",
+ " Tobias Binder, Laura Covi and Kyohei Mukaida \n",
+ " hep-ph,astro-ph.CO,hep-th \n",
+ " Traditional computations of the dark matter ... \n",
+ " Dark Matter Sommerfeld-enhanced annihilation a... \n",
+ " \n",
+ " \n",
+ " 414998 \n",
+ " 2010.12385 \n",
+ " Resonances in hyperbolic dynamics \n",
+ " 2018 \n",
+ " St\\'ephane Nonnenmacher \n",
+ " math-ph,math.DS,math.MP,math.SP \n",
+ " The study of wave propagation outside bounde... \n",
+ " Resonances in hyperbolic dynamics The study ... \n",
+ " \n",
+ " \n",
+ " 189313 \n",
+ " 1506.01307 \n",
+ " Control of fixed points and existence and uniq... \n",
+ " 2016 \n",
+ " George Glauberman and Justin Lynd \n",
+ " math.GR,math.AT \n",
+ " A. Chermak has recently proved that to each ... \n",
+ " Control of fixed points and existence and uniq... \n",
+ " \n",
+ " \n",
+ " 65915 \n",
+ " 1108.5137 \n",
+ " A Laser System for the Spectroscopy of Highly-... \n",
+ " 2012 \n",
+ " S. Albrecht, S. Altenburg, C. Siegel, N. Hersc... \n",
+ " physics.atom-ph,nucl-ex \n",
+ " We present and characterize a laser system f... \n",
+ " A Laser System for the Spectroscopy of Highly-... \n",
+ " \n",
+ " \n",
+ " 319340 \n",
+ " 1809.05654 \n",
+ " Changes of graph structure of transition proba... \n",
+ " 2018 \n",
+ " Teruaki Okushima, Tomoaki Niiyama, Kensuke S. ... \n",
+ " cond-mat.dis-nn \n",
+ " Graphs of the most probable transitions for ... \n",
+ " Changes of graph structure of transition proba... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
5000 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id title year \\\n",
+ "210240 1512.07410 Fragmentation of long-lived hydrocarbons after... 2016 \n",
+ "148715 1404.1520 Single spin stochastic optical reconstruction ... 2014 \n",
+ "219516 1603.07790 Weighted Pushdown Systems with Indexed Weight ... 2016 \n",
+ "370814 1911.02005 Simultaneous spectral estimation of dephasing ... 2020 \n",
+ "228464 1606.06192 A Novel Quasi-One-Dimensional Topological Insu... 2016 \n",
+ "... ... ... ... \n",
+ "316199 1808.06472 Dark Matter Sommerfeld-enhanced annihilation a... 2018 \n",
+ "414998 2010.12385 Resonances in hyperbolic dynamics 2018 \n",
+ "189313 1506.01307 Control of fixed points and existence and uniq... 2016 \n",
+ "65915 1108.5137 A Laser System for the Spectroscopy of Highly-... 2012 \n",
+ "319340 1809.05654 Changes of graph structure of transition proba... 2018 \n",
+ "\n",
+ " authors \\\n",
+ "210240 Seyedreza Larimian, Sonia Erattupuzha, Erik L\\... \n",
+ "148715 Matthias Pfender, Nabeel Aslam, Gerald Waldher... \n",
+ "219516 Yasuhiko Minamide \n",
+ "370814 Virginia Frey, Leigh M. Norris, Lorenza Viola ... \n",
+ "228464 Gabriel Aut\\`es, Anna Isaeva, Luca Moreschini,... \n",
+ "... ... \n",
+ "316199 Tobias Binder, Laura Covi and Kyohei Mukaida \n",
+ "414998 St\\'ephane Nonnenmacher \n",
+ "189313 George Glauberman and Justin Lynd \n",
+ "65915 S. Albrecht, S. Altenburg, C. Siegel, N. Hersc... \n",
+ "319340 Teruaki Okushima, Tomoaki Niiyama, Kensuke S. ... \n",
+ "\n",
+ " categories \\\n",
+ "210240 physics.chem-ph,physics.atm-clus,physics.optics \n",
+ "148715 quant-ph,physics.optics \n",
+ "219516 cs.FL,cs.PL \n",
+ "370814 quant-ph \n",
+ "228464 cond-mat.mtrl-sci,cond-mat.mes-hall \n",
+ "... ... \n",
+ "316199 hep-ph,astro-ph.CO,hep-th \n",
+ "414998 math-ph,math.DS,math.MP,math.SP \n",
+ "189313 math.GR,math.AT \n",
+ "65915 physics.atom-ph,nucl-ex \n",
+ "319340 cond-mat.dis-nn \n",
+ "\n",
+ " abstract \\\n",
+ "210240 We experimentally and theoretically investig... \n",
+ "148715 We experimentally demonstrate precision addr... \n",
+ "219516 The reachability analysis of weighted pushdo... \n",
+ "370814 The fragility of quantum systems makes them ... \n",
+ "228464 Recent progress in the field of topological ... \n",
+ "... ... \n",
+ "316199 Traditional computations of the dark matter ... \n",
+ "414998 The study of wave propagation outside bounde... \n",
+ "189313 A. Chermak has recently proved that to each ... \n",
+ "65915 We present and characterize a laser system f... \n",
+ "319340 Graphs of the most probable transitions for ... \n",
+ "\n",
+ " text \n",
+ "210240 Fragmentation of long-lived hydrocarbons after... \n",
+ "148715 Single spin stochastic optical reconstruction ... \n",
+ "219516 Weighted Pushdown Systems with Indexed Weight ... \n",
+ "370814 Simultaneous spectral estimation of dephasing ... \n",
+ "228464 A Novel Quasi-One-Dimensional Topological Insu... \n",
+ "... ... \n",
+ "316199 Dark Matter Sommerfeld-enhanced annihilation a... \n",
+ "414998 Resonances in hyperbolic dynamics The study ... \n",
+ "189313 Control of fixed points and existence and uniq... \n",
+ "65915 A Laser System for the Spectroscopy of Highly-... \n",
+ "319340 Changes of graph structure of transition proba... \n",
+ "\n",
+ "[5000 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'astro-ph': 'Astrophysics',\n",
+ " 'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',\n",
+ " 'astro-ph.EP': 'Earth and Planetary Astrophysics',\n",
+ " 'astro-ph.GA': 'Astrophysics of Galaxies',\n",
+ " 'astro-ph.HE': 'High Energy Astrophysical Phenomena',\n",
+ " 'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',\n",
+ " 'astro-ph.SR': 'Solar and Stellar Astrophysics',\n",
+ " 'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',\n",
+ " 'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',\n",
+ " 'cond-mat.mtrl-sci': 'Materials Science',\n",
+ " 'cond-mat.other': 'Other Condensed Matter',\n",
+ " 'cond-mat.quant-gas': 'Quantum Gases',\n",
+ " 'cond-mat.soft': 'Soft Condensed Matter',\n",
+ " 'cond-mat.stat-mech': 'Statistical Mechanics',\n",
+ " 'cond-mat.str-el': 'Strongly Correlated Electrons',\n",
+ " 'cond-mat.supr-con': 'Superconductivity',\n",
+ " 'cs.AI': 'Artificial Intelligence',\n",
+ " 'cs.AR': 'Hardware Architecture',\n",
+ " 'cs.CC': 'Computational Complexity',\n",
+ " 'cs.CE': 'Computational Engineering, Finance, and Science',\n",
+ " 'cs.CG': 'Computational Geometry',\n",
+ " 'cs.CL': 'Computation and Language',\n",
+ " 'cs.CR': 'Cryptography and Security',\n",
+ " 'cs.CV': 'Computer Vision and Pattern Recognition',\n",
+ " 'cs.CY': 'Computers and Society',\n",
+ " 'cs.DB': 'Databases',\n",
+ " 'cs.DC': 'Distributed, Parallel, and Cluster Computing',\n",
+ " 'cs.DL': 'Digital Libraries',\n",
+ " 'cs.DM': 'Discrete Mathematics',\n",
+ " 'cs.DS': 'Data Structures and Algorithms',\n",
+ " 'cs.ET': 'Emerging Technologies',\n",
+ " 'cs.FL': 'Formal Languages and Automata Theory',\n",
+ " 'cs.GL': 'General Literature',\n",
+ " 'cs.GR': 'Graphics',\n",
+ " 'cs.GT': 'Computer Science and Game Theory',\n",
+ " 'cs.HC': 'Human-Computer Interaction',\n",
+ " 'cs.IR': 'Information Retrieval',\n",
+ " 'cs.IT': 'Information Theory',\n",
+ " 'cs.LG': 'Machine Learning',\n",
+ " 'cs.LO': 'Logic in Computer Science',\n",
+ " 'cs.MA': 'Multiagent Systems',\n",
+ " 'cs.MM': 'Multimedia',\n",
+ " 'cs.MS': 'Mathematical Software',\n",
+ " 'cs.NA': 'Numerical Analysis',\n",
+ " 'cs.NE': 'Neural and Evolutionary Computing',\n",
+ " 'cs.NI': 'Networking and Internet Architecture',\n",
+ " 'cs.OH': 'Other Computer Science',\n",
+ " 'cs.OS': 'Operating Systems',\n",
+ " 'cs.PF': 'Performance',\n",
+ " 'cs.PL': 'Programming Languages',\n",
+ " 'cs.RO': 'Robotics',\n",
+ " 'cs.SC': 'Symbolic Computation',\n",
+ " 'cs.SD': 'Sound',\n",
+ " 'cs.SE': 'Software Engineering',\n",
+ " 'cs.SI': 'Social and Information Networks',\n",
+ " 'cs.SY': 'Systems and Control',\n",
+ " 'econ.EM': 'Econometrics',\n",
+ " 'eess.AS': 'Audio and Speech Processing',\n",
+ " 'eess.IV': 'Image and Video Processing',\n",
+ " 'eess.SP': 'Signal Processing',\n",
+ " 'gr-qc': 'General Relativity and Quantum Cosmology',\n",
+ " 'hep-ex': 'High Energy Physics - Experiment',\n",
+ " 'hep-lat': 'High Energy Physics - Lattice',\n",
+ " 'hep-ph': 'High Energy Physics - Phenomenology',\n",
+ " 'hep-th': 'High Energy Physics - Theory',\n",
+ " 'math.AC': 'Commutative Algebra',\n",
+ " 'math.AG': 'Algebraic Geometry',\n",
+ " 'math.AP': 'Analysis of PDEs',\n",
+ " 'math.AT': 'Algebraic Topology',\n",
+ " 'math.CA': 'Classical Analysis and ODEs',\n",
+ " 'math.CO': 'Combinatorics',\n",
+ " 'math.CT': 'Category Theory',\n",
+ " 'math.CV': 'Complex Variables',\n",
+ " 'math.DG': 'Differential Geometry',\n",
+ " 'math.DS': 'Dynamical Systems',\n",
+ " 'math.FA': 'Functional Analysis',\n",
+ " 'math.GM': 'General Mathematics',\n",
+ " 'math.GN': 'General Topology',\n",
+ " 'math.GR': 'Group Theory',\n",
+ " 'math.GT': 'Geometric Topology',\n",
+ " 'math.HO': 'History and Overview',\n",
+ " 'math.IT': 'Information Theory',\n",
+ " 'math.KT': 'K-Theory and Homology',\n",
+ " 'math.LO': 'Logic',\n",
+ " 'math.MG': 'Metric Geometry',\n",
+ " 'math.MP': 'Mathematical Physics',\n",
+ " 'math.NA': 'Numerical Analysis',\n",
+ " 'math.NT': 'Number Theory',\n",
+ " 'math.OA': 'Operator Algebras',\n",
+ " 'math.OC': 'Optimization and Control',\n",
+ " 'math.PR': 'Probability',\n",
+ " 'math.QA': 'Quantum Algebra',\n",
+ " 'math.RA': 'Rings and Algebras',\n",
+ " 'math.RT': 'Representation Theory',\n",
+ " 'math.SG': 'Symplectic Geometry',\n",
+ " 'math.SP': 'Spectral Theory',\n",
+ " 'math.ST': 'Statistics Theory',\n",
+ " 'math-ph': 'Mathematical Physics',\n",
+ " 'nlin.AO': 'Adaptation and Self-Organizing Systems',\n",
+ " 'nlin.CD': 'Chaotic Dynamics',\n",
+ " 'nlin.CG': 'Cellular Automata and Lattice Gases',\n",
+ " 'nlin.PS': 'Pattern Formation and Solitons',\n",
+ " 'nlin.SI': 'Exactly Solvable and Integrable Systems',\n",
+ " 'nucl-ex': 'Nuclear Experiment',\n",
+ " 'nucl-th': 'Nuclear Theory',\n",
+ " 'physics.acc-ph': 'Accelerator Physics',\n",
+ " 'physics.ao-ph': 'Atmospheric and Oceanic Physics',\n",
+ " 'physics.app-ph': 'Applied Physics',\n",
+ " 'physics.atm-clus': 'Atomic and Molecular Clusters',\n",
+ " 'physics.atom-ph': 'Atomic Physics',\n",
+ " 'physics.bio-ph': 'Biological Physics',\n",
+ " 'physics.chem-ph': 'Chemical Physics',\n",
+ " 'physics.class-ph': 'Classical Physics',\n",
+ " 'physics.comp-ph': 'Computational Physics',\n",
+ " 'physics.data-an': 'Data Analysis, Statistics and Probability',\n",
+ " 'physics.ed-ph': 'Physics Education',\n",
+ " 'physics.flu-dyn': 'Fluid Dynamics',\n",
+ " 'physics.gen-ph': 'General Physics',\n",
+ " 'physics.geo-ph': 'Geophysics',\n",
+ " 'physics.hist-ph': 'History and Philosophy of Physics',\n",
+ " 'physics.ins-det': 'Instrumentation and Detectors',\n",
+ " 'physics.med-ph': 'Medical Physics',\n",
+ " 'physics.optics': 'Optics',\n",
+ " 'physics.plasm-ph': 'Plasma Physics',\n",
+ " 'physics.pop-ph': 'Popular Physics',\n",
+ " 'physics.soc-ph': 'Physics and Society',\n",
+ " 'physics.space-ph': 'Space Physics',\n",
+ " 'q-bio.BM': 'Biomolecules',\n",
+ " 'q-bio.CB': 'Cell Behavior',\n",
+ " 'q-bio.GN': 'Genomics',\n",
+ " 'q-bio.MN': 'Molecular Networks',\n",
+ " 'q-bio.NC': 'Neurons and Cognition',\n",
+ " 'q-bio.OT': 'Other Quantitative Biology',\n",
+ " 'q-bio.PE': 'Populations and Evolution',\n",
+ " 'q-bio.QM': 'Quantitative Methods',\n",
+ " 'q-bio.SC': 'Subcellular Processes',\n",
+ " 'q-bio.TO': 'Tissues and Organs',\n",
+ " 'q-fin.CP': 'Computational Finance',\n",
+ " 'q-fin.EC': 'Economics',\n",
+ " 'q-fin.GN': 'General Finance',\n",
+ " 'q-fin.MF': 'Mathematical Finance',\n",
+ " 'q-fin.PM': 'Portfolio Management',\n",
+ " 'q-fin.PR': 'Pricing of Securities',\n",
+ " 'q-fin.RM': 'Risk Management',\n",
+ " 'q-fin.ST': 'Statistical Finance',\n",
+ " 'q-fin.TR': 'Trading and Market Microstructure',\n",
+ " 'quant-ph': 'Quantum Physics',\n",
+ " 'stat.AP': 'Applications',\n",
+ " 'stat.CO': 'Computation',\n",
+ " 'stat.ME': 'Methodology',\n",
+ " 'stat.ML': 'Machine Learning',\n",
+ " 'stat.OT': 'Other Statistics',\n",
+ " 'stat.TH': 'Statistics Theory'}"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "CATEGORIES"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5000, 6)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " title \n",
+ " year \n",
+ " authors \n",
+ " categories \n",
+ " abstract \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 210240 \n",
+ " 1512.07410 \n",
+ " Fragmentation of long-lived hydrocarbons after... \n",
+ " 2016 \n",
+ " Seyedreza Larimian, Sonia Erattupuzha, Erik L\\... \n",
+ " physics.chem-ph,physics.atm-clus,physics.optics \n",
+ " We experimentally and theoretically investig... \n",
+ " \n",
+ " \n",
+ " 148715 \n",
+ " 1404.1520 \n",
+ " Single spin stochastic optical reconstruction ... \n",
+ " 2014 \n",
+ " Matthias Pfender, Nabeel Aslam, Gerald Waldher... \n",
+ " quant-ph,physics.optics \n",
+ " We experimentally demonstrate precision addr... \n",
+ " \n",
+ " \n",
+ " 219516 \n",
+ " 1603.07790 \n",
+ " Weighted Pushdown Systems with Indexed Weight ... \n",
+ " 2016 \n",
+ " Yasuhiko Minamide \n",
+ " cs.FL,cs.PL \n",
+ " The reachability analysis of weighted pushdo... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id title year \\\n",
+ "210240 1512.07410 Fragmentation of long-lived hydrocarbons after... 2016 \n",
+ "148715 1404.1520 Single spin stochastic optical reconstruction ... 2014 \n",
+ "219516 1603.07790 Weighted Pushdown Systems with Indexed Weight ... 2016 \n",
+ "\n",
+ " authors \\\n",
+ "210240 Seyedreza Larimian, Sonia Erattupuzha, Erik L\\... \n",
+ "148715 Matthias Pfender, Nabeel Aslam, Gerald Waldher... \n",
+ "219516 Yasuhiko Minamide \n",
+ "\n",
+ " categories \\\n",
+ "210240 physics.chem-ph,physics.atm-clus,physics.optics \n",
+ "148715 quant-ph,physics.optics \n",
+ "219516 cs.FL,cs.PL \n",
+ "\n",
+ " abstract \n",
+ "210240 We experimentally and theoretically investig... \n",
+ "148715 We experimentally demonstrate precision addr... \n",
+ "219516 The reachability analysis of weighted pushdo... "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['text'] = df['title'] + ' ' + df['abstract']\n",
+ "# df['categories'] = df['categories'].apply(lambda x: x.split(','))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'physics.chem-ph,physics.atm-clus,physics.optics'"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.iloc[0].categories"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Train dataset creation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "((5000, 7), (4000, 7), (1000, 7))"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_train, df_test = train_test_split(df, train_size=0.8)\n",
+ "\n",
+ "df.shape, df_train.shape, df_test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_tokenizer(tokenizer_model):\n",
+ "    \"\"\"Load the pretrained tokenizer named by tokenizer_model; return (tokenize_function, tokenizer).\"\"\"\n",
+ "    loaded = AutoTokenizer.from_pretrained(tokenizer_model)\n",
+ "    def tokenize_function(examples):\n",
+ "        return loaded(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
+ "    return tokenize_function, loaded\n",
+ "\n",
+ "tokenize_function, tokenizer = get_tokenizer('bert-base-uncased')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only samples whose categories all appear in the predefined CATEGORIES mapping,\n",
+ "# then redo the split: df_train/df_test were cut from the unfiltered frame above.\n",
+ "known_categories = set(CATEGORIES)  # build the lookup set once, not once per row\n",
+ "df['split_categories'] = df['categories'].apply(lambda x: x.split(','))\n",
+ "\n",
+ "df = df[df['split_categories'].apply(known_categories.issuperset)]\n",
+ "df_train, df_test = train_test_split(df, train_size=0.8)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA',\n",
+ " 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn',\n",
+ " 'cond-mat.mes-hall', 'cond-mat.mtrl-sci'], dtype=object)"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mlb = MultiLabelBinarizer()\n",
+ "# mlb.fit([[(k,v) for k, v in CATEGORIES.items()]]) #df_train['categories'])\n",
+ "mlb.fit([list(CATEGORIES.keys())]) #df_train['categories'])\n",
+ "mlb.classes_[:10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocess_data(examples):\n",
+ "    \"\"\"Tokenize a batch of texts and attach the binarized categories as float labels.\"\"\"\n",
+ "    # pad/truncate every text to exactly 128 tokens\n",
+ "    encoding = tokenizer(\n",
+ "        examples[\"text\"], padding=\"max_length\", truncation=True, max_length=128\n",
+ "    )\n",
+ "    # each sample's comma-separated category string becomes one list of labels\n",
+ "    label_lists = [entry.split(',') for entry in examples['categories']]\n",
+ "    # mlb.transform yields one indicator row per sample; cast it to float\n",
+ "    encoding[\"labels\"] = mlb.transform(label_lists).astype(float)\n",
+ "\n",
+ "    return encoding"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9692bf48791b4a1d87b8c5d796cef366",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/4 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/paulo/.local/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:878: UserWarning: unknown class(es) ['econ.GN', 'eess.SY'] will be ignored\n",
+ " warnings.warn(\n",
+ "/home/paulo/.local/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:878: UserWarning: unknown class(es) ['econ.GN', 'econ.TH', 'eess.SY'] will be ignored\n",
+ " warnings.warn(\n",
+ "/home/paulo/.local/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:878: UserWarning: unknown class(es) ['eess.SY'] will be ignored\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d6bc7b01dfa843f69bf4ba3148a50f09",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/paulo/.local/lib/python3.9/site-packages/sklearn/preprocessing/_label.py:878: UserWarning: unknown class(es) ['econ.GN', 'eess.SY'] will be ignored\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_train_hf = Dataset.from_pandas(df_train[['text', 'categories']])\n",
+ "tokenized_train = df_train_hf.map(preprocess_data, batched=True)\n",
+ "\n",
+ "df_test_hf = Dataset.from_pandas(df_test[['text', 'categories']])\n",
+ "tokenized_test = df_test_hf.map(preprocess_data, batched=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Reversed [('cond-mat.str-el', 'hep-th')]\n",
+ "Original categories cond-mat.str-el,hep-th\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Get inverse transform, as an example\n",
+ "\n",
+ "print(\"Reversed\", mlb.inverse_transform(np.asarray(tokenized_test[0]['labels']).reshape(1, -1)))\n",
+ "print(\"Original categories\", tokenized_test[0]['categories'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+ "To disable this warning, you can either:\n",
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+ "To disable this warning, you can either:\n",
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Store multilabel binarizer as a pickle file\n",
+ "\n",
+ "!rm -rf checkpoint\n",
+ "!mkdir checkpoint\n",
+ "with open('checkpoint/mlb.pkl', 'wb') as handle:\n",
+ " pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Training the multi-label classification model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']\n",
+ "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = BertForSequenceClassification.from_pretrained(\n",
+ " \"bert-base-uncased\",\n",
+ " num_labels=len(mlb.classes_),\n",
+ " problem_type=\"multi_label_classification\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Adaptation: https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb \n",
+ "# \n",
+ "\n",
+ "# Define batch size according to your GPU RAM\n",
+ "batch_size = 8\n",
+ "nb_epochs = 1 # DEMONSTRATIVE PURPOSES\n",
+ "metric_name = \"f1\"\n",
+ "\n",
+ "args = TrainingArguments(\n",
+ " f\"paper-multilabel-finetuning\",\n",
+ " evaluation_strategy=\"epoch\",\n",
+ " save_strategy=\"epoch\",\n",
+ " save_total_limit=1,\n",
+ " learning_rate=2e-5,\n",
+ " per_device_train_batch_size=batch_size,\n",
+ " per_device_eval_batch_size=batch_size//2,\n",
+ " num_train_epochs=nb_epochs,\n",
+ " weight_decay=0.01,\n",
+ " load_best_model_at_end=True,\n",
+ " metric_for_best_model=metric_name,\n",
+ " eval_accumulation_steps=1,\n",
+ ")\n",
+ "\n",
+ "# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/\n",
+ "def multi_label_metrics(predictions, labels, threshold=0.5):\n",
+ "    \"\"\"Return micro-averaged F1, micro-averaged ROC AUC and accuracy for a batch of logits.\"\"\"\n",
+ "    # sigmoid maps each logit to an independent per-label probability\n",
+ "    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()\n",
+ "    # threshold the probabilities into hard 0/1 predictions\n",
+ "    y_pred = (probs >= threshold).astype(float)\n",
+ "    y_true = labels\n",
+ "    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')\n",
+ "    # ROC AUC ranks scores, so it gets the probabilities; thresholded labels\n",
+ "    # would reduce the curve to a single operating point.\n",
+ "    roc_auc = roc_auc_score(y_true, probs, average='micro')\n",
+ "    accuracy = accuracy_score(y_true, y_pred)\n",
+ "    # return as dictionary\n",
+ "    metrics = {'f1': f1_micro_average,\n",
+ "               'roc_auc': roc_auc,\n",
+ "               'accuracy': accuracy}\n",
+ "    return metrics\n",
+ "\n",
+ "def compute_metrics(p: EvalPrediction):\n",
+ "    \"\"\"Adapt a Trainer EvalPrediction to multi_label_metrics.\"\"\"\n",
+ "    # predictions may arrive as a tuple; its first element is used as the logits\n",
+ "    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions\n",
+ "    # pass the unwrapped value, not the raw (possibly tuple) p.predictions\n",
+ "    return multi_label_metrics(predictions=preds, labels=p.label_ids)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trainer = Trainer(\n",
+ " model,\n",
+ " args,\n",
+ " train_dataset=tokenized_train,\n",
+ " eval_dataset=tokenized_test,\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/a.yushkovskiy/gh/atemate/redis-arXiv-search/.venv/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+ " warnings.warn(\n",
+ "***** Running training *****\n",
+ " Num examples = 48\n",
+ " Num Epochs = 1\n",
+ " Instantaneous batch size per device = 8\n",
+ " Total train batch size (w. parallel, distributed & accumulation) = 8\n",
+ " Gradient Accumulation steps = 1\n",
+ " Total optimization steps = 6\n",
+ " Number of trainable parameters = 109599897\n",
+ "The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, categories, __index_level_0__. If text, categories, __index_level_0__ are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n",
+ "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ "
\n",
+ " [6/6 00:08, Epoch 1/1]\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Epoch \n",
+ " Training Loss \n",
+ " Validation Loss \n",
+ " F1 \n",
+ " Roc Auc \n",
+ " Accuracy \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " No log \n",
+ " 0.657666 \n",
+ " 0.026588 \n",
+ " 0.502474 \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "***** Running Evaluation *****\n",
+ " Num examples = 12\n",
+ " Batch size = 4\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, categories, __index_level_0__. If text, categories, __index_level_0__ are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n",
+ "Saving model checkpoint to paper-multilabel-finetuning/checkpoint-6\n",
+ "Configuration saved in paper-multilabel-finetuning/checkpoint-6/config.json\n",
+ "Model weights saved in paper-multilabel-finetuning/checkpoint-6/pytorch_model.bin\n",
+ "tokenizer config file saved in paper-multilabel-finetuning/checkpoint-6/tokenizer_config.json\n",
+ "Special tokens file saved in paper-multilabel-finetuning/checkpoint-6/special_tokens_map.json\n",
+ "\n",
+ "\n",
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
+ "\n",
+ "\n",
+ "Loading best model from paper-multilabel-finetuning/checkpoint-6 (score: 0.026587887740029542).\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "TrainOutput(global_step=6, training_loss=0.6783577601114908, metrics={'train_runtime': 9.3058, 'train_samples_per_second': 5.158, 'train_steps_per_second': 0.645, 'total_flos': 3161613275136.0, 'train_loss': 0.6783577601114908, 'epoch': 1.0})"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "***** Running Evaluation *****\n",
+ " Num examples = 12\n",
+ " Batch size = 4\n",
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, categories, __index_level_0__. If text, categories, __index_level_0__ are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ "
\n",
+ " [3/3 00:00]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'eval_loss': 0.6576664447784424,\n",
+ " 'eval_f1': 0.026587887740029542,\n",
+ " 'eval_roc_auc': 0.5024737713970182,\n",
+ " 'eval_accuracy': 0.0,\n",
+ " 'eval_runtime': 0.6499,\n",
+ " 'eval_samples_per_second': 18.465,\n",
+ " 'eval_steps_per_second': 4.616,\n",
+ " 'epoch': 1.0}"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "eval_res = trainer.evaluate()\n",
+ "eval_res"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Perform inference on a given text sample"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "astro-ph.HE\n"
+ ]
+ }
+ ],
+ "source": [
+ "text = df['text'].iloc[5]\n",
+ "categories = df['categories'].iloc[5]\n",
+ "print(categories)\n",
+ "# Truncate to the same 128-token limit preprocess_data uses, so long abstracts still fit the model.\n",
+ "encoding = tokenizer(text, truncation=True, max_length=128, return_tensors=\"pt\")\n",
+ "encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}\n",
+ "with torch.no_grad():  # inference only: no autograd graph needed\n",
+ "    outputs = trainer.model(**encoding)\n",
+ "logits = outputs.logits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# apply sigmoid + threshold\n",
+ "sigmoid = torch.nn.Sigmoid()\n",
+ "probs = sigmoid(logits.squeeze().cpu())\n",
+ "predictions = np.zeros(probs.shape)\n",
+ "predictions[np.where(probs >= 0.3)] = 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "A Search for MeV to TeV Neutrinos from Fast Radio Bursts with IceCube We present two searches for IceCube neutrino events coincident with 28 fast\n",
+ "radio bursts (FRBs) and one repeating FRB. The first improves upon a previous\n",
+ "IceCube analysis -- searching for spatial and temporal correlation of events\n",
+ "with FRBs at energies greater than roughly 50 GeV -- by increasing the\n",
+ "effective area by an order of magnitude. The second is a search for temporal\n",
+ "correlation of MeV neutrino events with FRBs. No significant correlation is\n",
+ "found in either search, therefore, we set upper limits on the time-integrated\n",
+ "neutrino flux emitted by FRBs for a range of emission timescales less than one\n",
+ "day. These are the first limits on FRB neutrino emission at the MeV scale, and\n",
+ "the limits set at higher energies are an order-of-magnitude improvement over\n",
+ "those set by any neutrino telescope.\n",
+ "\n",
+ "[('astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA', 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn', 'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cond-mat.other', 'cond-mat.quant-gas', 'cond-mat.soft', 'cond-mat.stat-mech', 'cond-mat.str-el', 'cond-mat.supr-con', 'cs.AI', 'cs.AR', 'cs.CC', 'cs.CE', 'cs.CG', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', 'cs.DB', 'cs.DC', 'cs.DL', 'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.MS', 'cs.NA', 'cs.NE', 'cs.NI', 'cs.OH', 'cs.OS', 'cs.PF', 'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE', 'cs.SI', 'cs.SY', 'econ.EM', 'eess.AS', 'eess.IV', 'eess.SP', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA', 'math.CO', 'math.CT', 'math.CV', 'math.DG', 'math.DS', 'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT', 'math.HO', 'math.IT', 'math.KT', 'math.LO', 'math.MG', 'math.NA', 'math.NT', 'math.OA', 'math.OC', 'math.PR', 'math.QA', 'math.RA', 'math.RT', 'math.SG', 'math.SP', 'math.ST', 'nlin.AO', 'nlin.CD', 'nlin.CG', 'nlin.PS', 'nlin.SI', 'nucl-ex', 'nucl-th', 'physics.acc-ph', 'physics.ao-ph', 'physics.app-ph', 'physics.atm-clus', 'physics.atom-ph', 'physics.bio-ph', 'physics.chem-ph', 'physics.class-ph', 'physics.comp-ph', 'physics.data-an', 'physics.ed-ph', 'physics.flu-dyn', 'physics.gen-ph', 'physics.geo-ph', 'physics.hist-ph', 'physics.ins-det', 'physics.med-ph', 'physics.optics', 'physics.plasm-ph', 'physics.pop-ph', 'physics.soc-ph', 'physics.space-ph', 'q-bio.BM', 'q-bio.CB', 'q-bio.GN', 'q-bio.MN', 'q-bio.NC', 'q-bio.OT', 'q-bio.PE', 'q-bio.QM', 'q-bio.SC', 'q-fin.CP', 'q-fin.EC', 'q-fin.GN', 'q-fin.MF', 'q-fin.PM', 'q-fin.PR', 'q-fin.RM', 'q-fin.ST', 'q-fin.TR', 'quant-ph', 'stat.AP', 'stat.CO', 'stat.ME', 'stat.ML', 'stat.OT', 'stat.TH')]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(text)\n",
+ "print(mlb.inverse_transform(predictions.reshape(1, -1)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Saving model checkpoint to ./checkpoint\n",
+ "Configuration saved in ./checkpoint/config.json\n",
+ "Model weights saved in ./checkpoint/pytorch_model.bin\n",
+ "tokenizer config file saved in ./checkpoint/tokenizer_config.json\n",
+ "Special tokens file saved in ./checkpoint/special_tokens_map.json\n"
+ ]
+ }
+ ],
+ "source": [
+ "trainer.save_model(output_dir='./checkpoint')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('./checkpoint/model_info.json', 'w') as f:\n",
+ " f.write(json.dumps(eval_res, indent=4))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/data/multilabel_classifier/requirements.txt b/data/multilabel_classifier/requirements.txt
new file mode 100644
index 0000000..1033e90
--- /dev/null
+++ b/data/multilabel_classifier/requirements.txt
@@ -0,0 +1,2 @@
+-e ../../backend # to access vecsim_app utils
+datasets
diff --git a/docker-local-redis.yml b/docker-local-redis.yml
index 3d9ee39..79b5860 100644
--- a/docker-local-redis.yml
+++ b/docker-local-redis.yml
@@ -10,6 +10,11 @@ services:
ports:
- 6379:6379
- 8001:8001
+ healthcheck:
+ test: ["CMD", "redis-cli", "ping"]
+ interval: 3s
+ timeout: 5s
+ retries: 5
backend:
image: ghcr.io/redisventures/redis-arxiv-search:latest
environment:
diff --git a/frontend/.tool-versions b/frontend/.tool-versions
new file mode 100644
index 0000000..0094556
--- /dev/null
+++ b/frontend/.tool-versions
@@ -0,0 +1 @@
+nodejs 16.14.2
diff --git a/frontend/package.json b/frontend/package.json
index 1595c87..fd9495f 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -9,7 +9,6 @@
"@material-ui/icons": "^4.11.3",
"@mui/icons-material": "^5.8.4",
"@mui/material": "^5.8.4",
- "material-ui-search-bar": "^1.0.0",
"@testing-library/jest-dom": "^5.14.1",
"@testing-library/react": "^13.0.0",
"@testing-library/user-event": "^13.2.1",
@@ -19,8 +18,10 @@
"@types/react-dom": "^18.0.0",
"autoprefixer": "10.4.5",
"bootstrap": "^5.1.3",
+ "immer": "^9.0.16",
"jwt-decode": "^3.1.2",
"material-ui": "^0.20.2",
+ "material-ui-search-bar": "^1.0.0",
"ra-data-simple-rest": "^4.2.0",
"react": "^18.2.0",
"react-admin": "^4.2.0",
@@ -29,7 +30,9 @@
"react-router": "^6.3.0",
"react-router-dom": "^6.3.0",
"react-scripts": "5.0.1",
+ "styled-components": "^5.3.6",
"typescript": "^4.4.2",
+ "use-immer": "^0.7.0",
"web-vitals": "^2.1.0"
},
"scripts": {
@@ -56,5 +59,8 @@
"last 1 safari version"
]
},
- "proxy": "http://localhost:8888"
+ "proxy": "http://localhost:8888",
+ "devDependencies": {
+ "@types/styled-components": "^5.1.26"
+ }
}
diff --git a/frontend/public/index.html b/frontend/public/index.html
index 614bb7f..769be84 100644
--- a/frontend/public/index.html
+++ b/frontend/public/index.html
@@ -9,6 +9,11 @@
name="description"
content="Redis vector similarity search demonstration"
/>
+
diff --git a/frontend/src/Layout.tsx b/frontend/src/Layout.tsx
index e42a91d..f99edd7 100644
--- a/frontend/src/Layout.tsx
+++ b/frontend/src/Layout.tsx
@@ -6,30 +6,13 @@ import { Footer } from './views/Footer';
export const Layout: FC = () => {
- const [papers, setPapers] = useState([]);
- const [categories, setCategories] = useState([]);
- const [years, setYears] = useState([]);
- const [state, setState] = useState('');
- const [total, setTotal] = useState(0);
-
return (
<>
-
+
>
);
};
-export default Layout;
\ No newline at end of file
+export default Layout;
diff --git a/frontend/src/Routes.tsx b/frontend/src/Routes.tsx
index d9cb190..80878e4 100644
--- a/frontend/src/Routes.tsx
+++ b/frontend/src/Routes.tsx
@@ -1,15 +1,11 @@
import { FC } from 'react';
import { BrowserRouter as Router, Route, Routes} from 'react-router-dom';
import Layout from './Layout';
-import { Admin } from './admin';
export const AppRoutes: FC = () => {
return (
-
- } />
-
}/>
diff --git a/frontend/src/admin/Admin.tsx b/frontend/src/admin/Admin.tsx
deleted file mode 100644
index d301931..0000000
--- a/frontend/src/admin/Admin.tsx
+++ /dev/null
@@ -1,19 +0,0 @@
-import { FC } from 'react';
-import { Admin as ReactAdmin, Resource } from 'react-admin';
-import authProvider from './authProvider';
-import { UserList, UserEdit, UserCreate } from './Users';
-import { dataProvider } from './dataProvider';
-
-
-export const Admin: FC = () => {
- return (
-
-
-
- );
-};
diff --git a/frontend/src/admin/Users/UserCreate.tsx b/frontend/src/admin/Users/UserCreate.tsx
deleted file mode 100644
index 1795f7e..0000000
--- a/frontend/src/admin/Users/UserCreate.tsx
+++ /dev/null
@@ -1,23 +0,0 @@
-import React, { FC } from 'react';
-import {
- Create,
- SimpleForm,
- TextInput,
- PasswordInput,
- BooleanInput,
-} from 'react-admin';
-
-export const UserCreate: FC = (props) => (
-
-
-
-
-
-
-
-
-
-
-
-
-);
diff --git a/frontend/src/admin/Users/UserEdit.tsx b/frontend/src/admin/Users/UserEdit.tsx
deleted file mode 100644
index d66ab89..0000000
--- a/frontend/src/admin/Users/UserEdit.tsx
+++ /dev/null
@@ -1,22 +0,0 @@
-import React, { FC } from 'react';
-import {
- Edit,
- SimpleForm,
- TextInput,
- PasswordInput,
- BooleanInput,
-} from 'react-admin';
-
-export const UserEdit: FC = (props) => (
-
-
-
-
-
-
-
-
-
-
-
-);
diff --git a/frontend/src/admin/Users/UserList.tsx b/frontend/src/admin/Users/UserList.tsx
deleted file mode 100644
index 241a594..0000000
--- a/frontend/src/admin/Users/UserList.tsx
+++ /dev/null
@@ -1,26 +0,0 @@
-// in src/users.js
-import React, { FC } from 'react';
-import {
- List,
- Datagrid,
- TextField,
- BooleanField,
- EmailField,
- EditButton,
-} from 'react-admin';
-
-export const UserList: FC = (props) => (
-
-
-
-
-
-
-
-
-
-
-
-
-
-);
diff --git a/frontend/src/admin/Users/index.ts b/frontend/src/admin/Users/index.ts
deleted file mode 100644
index 999f7e0..0000000
--- a/frontend/src/admin/Users/index.ts
+++ /dev/null
@@ -1,3 +0,0 @@
-export * from './UserEdit';
-export * from './UserList';
-export * from './UserCreate';
diff --git a/frontend/src/admin/authProvider.ts b/frontend/src/admin/authProvider.ts
deleted file mode 100644
index 3adeef8..0000000
--- a/frontend/src/admin/authProvider.ts
+++ /dev/null
@@ -1,56 +0,0 @@
-import decodeJwt from 'jwt-decode';
-import { BASE_URL } from '../config';
-
-type loginFormType = {
- username: string;
- password: string;
-};
-
-const authProvider = {
- login: ({ username, password }: loginFormType) => {
- let formData = new FormData();
- formData.append('username', username);
- formData.append('password', password);
- const request = new Request(BASE_URL + '/api/token', {
- method: 'POST',
- body: formData,
- });
- return fetch(request)
- .then((response) => {
- if (response.status < 200 || response.status >= 300) {
- throw new Error(response.statusText);
- }
- return response.json();
- })
- .then(({ access_token }) => {
- const decodedToken: any = decodeJwt(access_token);
- if (decodedToken.permissions !== 'admin') {
- throw new Error('Forbidden');
- }
- localStorage.setItem('token', access_token);
- localStorage.setItem('permissions', decodedToken.permissions);
- });
- },
- logout: () => {
- localStorage.removeItem('token');
- localStorage.removeItem('permissions');
- return Promise.resolve();
- },
- checkError: (error: { status: number }) => {
- const status = error.status;
- if (status === 401 || status === 403) {
- localStorage.removeItem('token');
- return Promise.reject();
- }
- return Promise.resolve();
- },
- checkAuth: () =>
- localStorage.getItem('token') ? Promise.resolve() : Promise.reject(),
- getPermissions: () => {
- const role = localStorage.getItem('permissions');
- return role ? Promise.resolve(role) : Promise.reject();
- // localStorage.getItem('token') ? Promise.resolve() : Promise.reject(),
- },
-};
-
-export default authProvider;
diff --git a/frontend/src/admin/dataProvider.js b/frontend/src/admin/dataProvider.js
deleted file mode 100644
index 64d2e17..0000000
--- a/frontend/src/admin/dataProvider.js
+++ /dev/null
@@ -1,19 +0,0 @@
-// Inspired by https://github.com/marmelab/admin-on-rest/blob/master/docs/Tutorial.md
-import { BASE_URL } from '../config';
-import { fetchUtils } from 'react-admin';
-import simpleRestProvider from 'ra-data-simple-rest';
-
-const httpClient = (url, options) => {
- if (!options) {
- options = {};
- }
- if (!options.headers) {
- options.headers = new Headers({ Accept: 'application/json' });
- }
- const token = localStorage.getItem('token');
- options.headers.set('Authorization', `Bearer ${token}`);
- return fetchUtils.fetchJson(url, options);
- };
-
- // TODO Implement custom data provider to fix the id problem with users
-export const dataProvider = simpleRestProvider(BASE_URL + "/api/v1", httpClient);
diff --git a/frontend/src/admin/index.ts b/frontend/src/admin/index.ts
deleted file mode 100644
index c956a8f..0000000
--- a/frontend/src/admin/index.ts
+++ /dev/null
@@ -1 +0,0 @@
-export * from './Admin';
diff --git a/frontend/src/api.ts b/frontend/src/api.ts
index 551d12d..138c67d 100644
--- a/frontend/src/api.ts
+++ b/frontend/src/api.ts
@@ -31,14 +31,14 @@ export const fetchFromBackend = async (url: string, method: string, body?: any)
return data;
}
-export const getPapers = async (limit=15, skip=0, years: string[] = [], categories: string[] = []) => {
+export const getPapers = async (limit = 15, skip = 0, years: string[] = [], categories: string[] = []) => {
var params: string;
- if ( !years.length && !categories.length ) {
+ if (!years.length && !categories.length) {
var params = `?limit=${limit}&skip=${skip}`
} else {
- if ( years.length && categories.length ) {
+ if (years.length && categories.length) {
var params = `?limit=${limit}&skip=${skip}&years=${years.join()}&categories=${categories.join()}`
- } else if ( years.length ) {
+ } else if (years.length) {
var params = `?limit=${limit}&skip=${skip}&years=${years.join()}`
} else {
var params = `?limit=${limit}&skip=${skip}&categories=${categories.join()}`
@@ -49,38 +49,71 @@ export const getPapers = async (limit=15, skip=0, years: string[] = [], categori
// get papers from Redis through the FastAPI backend
-export const getSemanticallySimilarPapers = async (paper_id: string,
- years: string[],
- categories: string[],
- search='KNN',
- limit=15) => {
- console.log(paper_id);
+interface PaperSearchRequest {
+ years: string[];
+ categories: string[];
+ search?: 'KNN' | 'ANN'
+ limit?: number;
+ matchExactCategories?: boolean;
+}
+
+interface SimilarPaperSearchRequest extends PaperSearchRequest {
+ paper_id: string;
+}
+
+export const getSemanticallySimilarPapers = async ({
+ paper_id,
+ years,
+ categories,
+ search = 'KNN',
+ limit = 15,
+ matchExactCategories = false
+}: SimilarPaperSearchRequest) => {
let body = {
paper_id: paper_id,
search_type: search,
number_of_results: limit,
years: years,
- categories: categories
+ categories: categories,
+ categories_operator: matchExactCategories ? 'AND' : 'OR'
}
const url = MASTER_URL + "vectorsearch/text";
return fetchFromBackend(url, 'POST', body);
};
+interface TextPaperSearchRequest extends PaperSearchRequest {
+ searchItems: string[],
+}
-export const getSemanticallySimilarPapersbyText = async (text: string,
- years: string[],
- categories: string[],
- search='KNN',
- limit=15) => {
+export const getSemanticallySimilarPapersbyText = async ({
+ searchItems,
+ years,
+ categories,
+ search = 'KNN',
+ limit = 15,
+ matchExactCategories = false
+}: TextPaperSearchRequest) => {
let body = {
- user_text: text,
+ articles: searchItems.map(text => ({ text })),
search_type: search,
number_of_results: limit,
years: years,
- categories: categories
+ categories,
+ categories_operator: matchExactCategories ? 'AND' : 'OR'
}
const url = MASTER_URL + "vectorsearch/text/user";
return fetchFromBackend(url, 'POST', body);
};
+
+
+// POSTs the texts to the "predict-categories" endpoint; yields the response's `categories`, or [] if falsy.
+export const getSuggestedCategories = async (articles: string[]) => {
+  const endpoint = MASTER_URL + "predict-categories"
+  const payload = { articles }
+
+  const response = await fetchFromBackend(endpoint, 'POST', payload)
+  const suggested = response.categories
+  return suggested || []
+};
diff --git a/frontend/src/components/Search/index.tsx b/frontend/src/components/Search/index.tsx
new file mode 100644
index 0000000..946f782
--- /dev/null
+++ b/frontend/src/components/Search/index.tsx
@@ -0,0 +1,28 @@
+
+import { OnSearchItemRemove, OnSearchStateChange, SearchStates } from "../../types/search";
+
+import { SearchBar } from '../../ui/SearchBar'
+
+interface Props {
+ searchStates: SearchStates
+ onSearchStateChange: OnSearchStateChange
+ onSearchItemRemove: OnSearchItemRemove
+}
+
+export const Search = ({ searchStates, ...props }: Props) => {
+
+ return (
+
+ {searchStates.map(
+ (searchState, index) =>
+ 1}
+ {...props}
+ />
+ )}
+
+ )
+}
diff --git a/frontend/src/components/SearchFilters/index.tsx b/frontend/src/components/SearchFilters/index.tsx
new file mode 100644
index 0000000..6d1a580
--- /dev/null
+++ b/frontend/src/components/SearchFilters/index.tsx
@@ -0,0 +1,101 @@
+import {
+ OutlinedInput,
+ InputLabel,
+ MenuItem,
+ FormControl,
+ ListItemText,
+ Button,
+ Checkbox,
+ Tooltip,
+ Select,
+ SelectChangeEvent,
+ CircularProgress
+} from '@mui/material';
+
+import { CATEGORY_HUMAN_NAMES, YEAR_FILTER_OPTIONS } from '../../constants/search_filter';
+import { SelectControlsWrapper, SelectControl } from './styles';
+
+const ITEM_HEIGHT = 48;
+const ITEM_PADDING_TOP = 8;
+const MenuProps = {
+ PaperProps: {
+ style: {
+ maxHeight: ITEM_HEIGHT * 4.5 + ITEM_PADDING_TOP,
+ width: 150,
+ },
+ },
+};
+
+interface Props {
+ years: string[];
+ onYearSelection: (event: SelectChangeEvent) => void;
+ categories: string[];
+ onCategorySelection: (event: SelectChangeEvent) => void;
+ matchExactCategories: boolean;
+ onMatchExactCategoriesChange: () => void
+}
+
+
+export const SearchFilters = ({
+ years,
+ onYearSelection,
+ categories,
+ onCategorySelection,
+ matchExactCategories,
+ onMatchExactCategoriesChange
+}: Props) => {
+
+ return (
+
+
+ Year
+ }
+ renderValue={(selected) => selected.join(', ')}
+ MenuProps={MenuProps}
+ >
+ {YEAR_FILTER_OPTIONS.map((year) => (
+
+ -1} />
+
+
+ ))}
+
+
+
+ Categories
+ }
+ renderValue={(selected) => selected.join(', ')}
+ MenuProps={MenuProps}
+ >
+
+
+ Settings:
+
+
+ Match exact combination of categories
+
+
+
+ {Object.entries(CATEGORY_HUMAN_NAMES).map(([slug, name]) => (
+
+ -1} />
+
+
+ ))}
+
+
+
+ )
+}
diff --git a/frontend/src/components/SearchFilters/styles.ts b/frontend/src/components/SearchFilters/styles.ts
new file mode 100644
index 0000000..1528306
--- /dev/null
+++ b/frontend/src/components/SearchFilters/styles.ts
@@ -0,0 +1,21 @@
+import styled from 'styled-components'
+
+export const SelectControlsWrapper = styled.div`
+ position: sticky;
+ z-index: 3;
+ top: 0;
+ left: 0;
+ right: 0;
+
+ height: 60px;
+ line-height: 60px;
+ padding: 0 16px;
+ background-color: #eee;
+
+ display: flex;
+`
+
+export const SelectControl = styled.div`
+ cursor: pointer;
+ user-select: none;
+`
diff --git a/frontend/src/components/SuggestedCategories/index.tsx b/frontend/src/components/SuggestedCategories/index.tsx
new file mode 100644
index 0000000..d62dc3e
--- /dev/null
+++ b/frontend/src/components/SuggestedCategories/index.tsx
@@ -0,0 +1,18 @@
+import { Link } from "@mui/material"
+import { Root } from './styles'
+import { CATEGORY_HUMAN_NAMES } from '../../constants/search_filter';
+
+export const SuggestedCategories = ({ options, onClick }: { options: string[], onClick: () => void }) => {
+ if (options.length === 0) return <>>
+
+ return (
+
+ Suggested categories:
+ {options.map(slug =>
+ {slug}
+ )}
+ Click here to apply them
+
+ )
+}
+
diff --git a/frontend/src/components/SuggestedCategories/styles.ts b/frontend/src/components/SuggestedCategories/styles.ts
new file mode 100644
index 0000000..099bd28
--- /dev/null
+++ b/frontend/src/components/SuggestedCategories/styles.ts
@@ -0,0 +1,10 @@
+import styled from 'styled-components'
+
+export const Root = styled.div`
+ margin-top: 1.5rem;
+ padding: 0.5rem;
+ background-color: #E5E0FF;
+ border-radius: 4px;
+
+
+`
diff --git a/frontend/src/constants/search_filter.ts b/frontend/src/constants/search_filter.ts
new file mode 100644
index 0000000..5a3a784
--- /dev/null
+++ b/frontend/src/constants/search_filter.ts
@@ -0,0 +1,170 @@
+export const YEAR_FILTER_OPTIONS = [
+ '2022',
+ '2021',
+ '2020',
+ '2019',
+ '2018',
+ '2017',
+ '2016',
+ '2015',
+ '2014',
+ '2013',
+ '2012',
+ '2011'
+]
+
+export const CATEGORY_HUMAN_NAMES = {
+ 'astro-ph': 'Astrophysics',
+ 'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
+ 'astro-ph.EP': 'Earth and Planetary Astrophysics',
+ 'astro-ph.GA': 'Astrophysics of Galaxies',
+ 'astro-ph.HE': 'High Energy Astrophysical Phenomena',
+ 'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
+ 'astro-ph.SR': 'Solar and Stellar Astrophysics',
+ 'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
+ 'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
+ 'cond-mat.mtrl-sci': 'Materials Science',
+ 'cond-mat.other': 'Other Condensed Matter',
+ 'cond-mat.quant-gas': 'Quantum Gases',
+ 'cond-mat.soft': 'Soft Condensed Matter',
+ 'cond-mat.stat-mech': 'Statistical Mechanics',
+ 'cond-mat.str-el': 'Strongly Correlated Electrons',
+ 'cond-mat.supr-con': 'Superconductivity',
+ 'cs.AI': 'Artificial Intelligence',
+ 'cs.AR': 'Hardware Architecture',
+ 'cs.CC': 'Computational Complexity',
+ 'cs.CE': 'Computational Engineering, Finance, and Science',
+ 'cs.CG': 'Computational Geometry',
+ 'cs.CL': 'Computation and Language',
+ 'cs.CR': 'Cryptography and Security',
+ 'cs.CV': 'Computer Vision and Pattern Recognition',
+ 'cs.CY': 'Computers and Society',
+ 'cs.DB': 'Databases',
+ 'cs.DC': 'Distributed, Parallel, and Cluster Computing',
+ 'cs.DL': 'Digital Libraries',
+ 'cs.DM': 'Discrete Mathematics',
+ 'cs.DS': 'Data Structures and Algorithms',
+ 'cs.ET': 'Emerging Technologies',
+ 'cs.FL': 'Formal Languages and Automata Theory',
+ 'cs.GL': 'General Literature',
+ 'cs.GR': 'Graphics',
+ 'cs.GT': 'Computer Science and Game Theory',
+ 'cs.HC': 'Human-Computer Interaction',
+ 'cs.IR': 'Information Retrieval',
+ 'cs.IT': 'Information Theory',
+ 'cs.LG': 'Machine Learning',
+ 'cs.LO': 'Logic in Computer Science',
+ 'cs.MA': 'Multiagent Systems',
+ 'cs.MM': 'Multimedia',
+ 'cs.MS': 'Mathematical Software',
+ 'cs.NA': 'Numerical Analysis',
+ 'cs.NE': 'Neural and Evolutionary Computing',
+ 'cs.NI': 'Networking and Internet Architecture',
+ 'cs.OH': 'Other Computer Science',
+ 'cs.OS': 'Operating Systems',
+ 'cs.PF': 'Performance',
+ 'cs.PL': 'Programming Languages',
+ 'cs.RO': 'Robotics',
+ 'cs.SC': 'Symbolic Computation',
+ 'cs.SD': 'Sound',
+ 'cs.SE': 'Software Engineering',
+ 'cs.SI': 'Social and Information Networks',
+ 'cs.SY': 'Systems and Control',
+ 'econ.EM': 'Econometrics',
+ 'eess.AS': 'Audio and Speech Processing',
+ 'eess.IV': 'Image and Video Processing',
+ 'eess.SP': 'Signal Processing',
+ 'gr-qc': 'General Relativity and Quantum Cosmology',
+ 'hep-ex': 'High Energy Physics - Experiment',
+ 'hep-lat': 'High Energy Physics - Lattice',
+ 'hep-ph': 'High Energy Physics - Phenomenology',
+ 'hep-th': 'High Energy Physics - Theory',
+ 'math.AC': 'Commutative Algebra',
+ 'math.AG': 'Algebraic Geometry',
+ 'math.AP': 'Analysis of PDEs',
+ 'math.AT': 'Algebraic Topology',
+ 'math.CA': 'Classical Analysis and ODEs',
+ 'math.CO': 'Combinatorics',
+ 'math.CT': 'Category Theory',
+ 'math.CV': 'Complex Variables',
+ 'math.DG': 'Differential Geometry',
+ 'math.DS': 'Dynamical Systems',
+ 'math.FA': 'Functional Analysis',
+ 'math.GM': 'General Mathematics',
+ 'math.GN': 'General Topology',
+ 'math.GR': 'Group Theory',
+ 'math.GT': 'Geometric Topology',
+ 'math.HO': 'History and Overview',
+ 'math.IT': 'Information Theory',
+ 'math.KT': 'K-Theory and Homology',
+ 'math.LO': 'Logic',
+ 'math.MG': 'Metric Geometry',
+ 'math.MP': 'Mathematical Physics',
+ 'math.NA': 'Numerical Analysis',
+ 'math.NT': 'Number Theory',
+ 'math.OA': 'Operator Algebras',
+ 'math.OC': 'Optimization and Control',
+ 'math.PR': 'Probability',
+ 'math.QA': 'Quantum Algebra',
+ 'math.RA': 'Rings and Algebras',
+ 'math.RT': 'Representation Theory',
+ 'math.SG': 'Symplectic Geometry',
+ 'math.SP': 'Spectral Theory',
+ 'math.ST': 'Statistics Theory',
+ 'math-ph': 'Mathematical Physics',
+ 'nlin.AO': 'Adaptation and Self-Organizing Systems',
+ 'nlin.CD': 'Chaotic Dynamics',
+ 'nlin.CG': 'Cellular Automata and Lattice Gases',
+ 'nlin.PS': 'Pattern Formation and Solitons',
+ 'nlin.SI': 'Exactly Solvable and Integrable Systems',
+ 'nucl-ex': 'Nuclear Experiment',
+ 'nucl-th': 'Nuclear Theory',
+ 'physics.acc-ph': 'Accelerator Physics',
+ 'physics.ao-ph': 'Atmospheric and Oceanic Physics',
+ 'physics.app-ph': 'Applied Physics',
+ 'physics.atm-clus': 'Atomic and Molecular Clusters',
+ 'physics.atom-ph': 'Atomic Physics',
+ 'physics.bio-ph': 'Biological Physics',
+ 'physics.chem-ph': 'Chemical Physics',
+ 'physics.class-ph': 'Classical Physics',
+ 'physics.comp-ph': 'Computational Physics',
+ 'physics.data-an': 'Data Analysis, Statistics and Probability',
+ 'physics.ed-ph': 'Physics Education',
+ 'physics.flu-dyn': 'Fluid Dynamics',
+ 'physics.gen-ph': 'General Physics',
+ 'physics.geo-ph': 'Geophysics',
+ 'physics.hist-ph': 'History and Philosophy of Physics',
+ 'physics.ins-det': 'Instrumentation and Detectors',
+ 'physics.med-ph': 'Medical Physics',
+ 'physics.optics': 'Optics',
+ 'physics.plasm-ph': 'Plasma Physics',
+ 'physics.pop-ph': 'Popular Physics',
+ 'physics.soc-ph': 'Physics and Society',
+ 'physics.space-ph': 'Space Physics',
+ 'q-bio.BM': 'Biomolecules',
+ 'q-bio.CB': 'Cell Behavior',
+ 'q-bio.GN': 'Genomics',
+ 'q-bio.MN': 'Molecular Networks',
+ 'q-bio.NC': 'Neurons and Cognition',
+ 'q-bio.OT': 'Other Quantitative Biology',
+ 'q-bio.PE': 'Populations and Evolution',
+ 'q-bio.QM': 'Quantitative Methods',
+ 'q-bio.SC': 'Subcellular Processes',
+ 'q-bio.TO': 'Tissues and Organs',
+ 'q-fin.CP': 'Computational Finance',
+ 'q-fin.EC': 'Economics',
+ 'q-fin.GN': 'General Finance',
+ 'q-fin.MF': 'Mathematical Finance',
+ 'q-fin.PM': 'Portfolio Management',
+ 'q-fin.PR': 'Pricing of Securities',
+ 'q-fin.RM': 'Risk Management',
+ 'q-fin.ST': 'Statistical Finance',
+ 'q-fin.TR': 'Trading and Market Microstructure',
+ 'quant-ph': 'Quantum Physics',
+ 'stat.AP': 'Applications',
+ 'stat.CO': 'Computation',
+ 'stat.ME': 'Methodology',
+ 'stat.ML': 'Machine Learning',
+ 'stat.OT': 'Other Statistics',
+ 'stat.TH': 'Statistics Theory'
+}
diff --git a/frontend/src/hooks/useDebounce.ts b/frontend/src/hooks/useDebounce.ts
new file mode 100644
index 0000000..bee4753
--- /dev/null
+++ b/frontend/src/hooks/useDebounce.ts
@@ -0,0 +1,34 @@
+import { useRef, useEffect } from "react";
+
+type Timer = ReturnType<typeof setTimeout>;
+type SomeFunction = (...args: any[]) => void;
+/**
+ * Debounce a callback for use inside a React component.
+ *
+ * @param func The original, non debounced function (You can pass any number of args to it)
+ * @param delay How long (in ms) calls must stop for before func is actually invoked
+ * @returns The debounced function, which runs func only once no further call has arrived for (delay) ms
+ */
+export function useDebounce<Func extends SomeFunction>(
+  func: Func,
+  delay = 1000
+) {
+  const timer = useRef<Timer>();
+  // Latest-render callback: a timer firing after a re-render must not run a stale closure.
+  const latestFunc = useRef(func);
+  useEffect(() => { latestFunc.current = func; });
+
+  // Drop any pending call when the component unmounts.
+  useEffect(() => {
+    return () => clearTimeout(timer.current);
+  }, []);
+
+  const debouncedFunction = ((...args) => {
+    clearTimeout(timer.current);
+    timer.current = setTimeout(() => {
+      latestFunc.current(...args);
+    }, delay);
+  }) as Func;
+
+  return debouncedFunction;
+}
diff --git a/frontend/src/index.tsx b/frontend/src/index.tsx
index dbccbfd..ea05162 100644
--- a/frontend/src/index.tsx
+++ b/frontend/src/index.tsx
@@ -5,7 +5,9 @@ import React from 'react';
import ReactDOM from 'react-dom/client';
import App from './App';
import reportWebVitals from './reportWebVitals';
+import { enableMapSet } from "immer"
+enableMapSet()
const root = ReactDOM.createRoot(
document.getElementById('root') as HTMLElement
diff --git a/frontend/src/types/search.d.ts b/frontend/src/types/search.d.ts
new file mode 100644
index 0000000..8056396
--- /dev/null
+++ b/frontend/src/types/search.d.ts
@@ -0,0 +1,4 @@
+export type SearchStates = string[]
+
+export type OnSearchStateChange = (index: number, newText: string) => void
+export type OnSearchItemRemove = (index: number) => void
diff --git a/frontend/src/ui/AddItemButton/index.tsx b/frontend/src/ui/AddItemButton/index.tsx
new file mode 100644
index 0000000..a2f2acc
--- /dev/null
+++ b/frontend/src/ui/AddItemButton/index.tsx
@@ -0,0 +1,22 @@
+import { Root, IconWrapper } from "./styles";
+import AddIcon from '@mui/icons-material/Add';
+
+interface Props {
+ text: string;
+ onClick: () => void
+}
+
+export const AddItemButton = ({ text, ...props }: Props) => {
+
+ return (
+
+
+
+
+
+
+ {text}
+
+
+ )
+}
diff --git a/frontend/src/ui/AddItemButton/styles.ts b/frontend/src/ui/AddItemButton/styles.ts
new file mode 100644
index 0000000..c6bb2cd
--- /dev/null
+++ b/frontend/src/ui/AddItemButton/styles.ts
@@ -0,0 +1,21 @@
+import styled from 'styled-components'
+
+export const Root = styled.div`
+    display: flex;
+
+    width: 100%;
+    height: 48px;
+    line-height: 46px;
+
+    cursor: pointer;
+    border: 1px dashed #555; /* needs the colon, otherwise the declaration is invalid and no border renders */
+    border-radius: 4px;
+
+    color: #666;
+
+    margin-top: 20px;
+`
+
+export const IconWrapper = styled.div`
+ width: 80px;
+`
diff --git a/frontend/src/ui/CopyToClipboard/index.tsx b/frontend/src/ui/CopyToClipboard/index.tsx
new file mode 100644
index 0000000..3cc0869
--- /dev/null
+++ b/frontend/src/ui/CopyToClipboard/index.tsx
@@ -0,0 +1,31 @@
+import { Snackbar, Link } from '@mui/material'
+import ShareIcon from '@mui/icons-material/Share';
+import { useState } from 'react'
+import { Root } from './styles';
+
+
+const CopyToClipboardButton = () => {
+ const [open, setOpen] = useState(false)
+
+    const handleClick = () => {
+        // writeText is asynchronous and may reject, so confirm only once the copy has succeeded
+        navigator.clipboard.writeText(window.location.toString()).then(() => setOpen(true)).catch(() => setOpen(false))
+    }
+
+ return (
+ <>
+
+ Share results
+
+
+
+ setOpen(false)}
+ autoHideDuration={2000}
+ message="Copied to clipboard"
+ />
+ >
+ )
+}
+export default CopyToClipboardButton
diff --git a/frontend/src/ui/CopyToClipboard/styles.ts b/frontend/src/ui/CopyToClipboard/styles.ts
new file mode 100644
index 0000000..6258baa
--- /dev/null
+++ b/frontend/src/ui/CopyToClipboard/styles.ts
@@ -0,0 +1,7 @@
+import styled from 'styled-components'
+
+export const Root = styled.div`
+ color: #1976d2;
+ cursor: pointer;
+`
+
diff --git a/frontend/src/ui/LoadingButton/index.tsx b/frontend/src/ui/LoadingButton/index.tsx
new file mode 100644
index 0000000..f2ccdb9
--- /dev/null
+++ b/frontend/src/ui/LoadingButton/index.tsx
@@ -0,0 +1,27 @@
+
+import {
+ Button,
+ CircularProgress
+} from '@mui/material';
+import { ReactElement } from 'react';
+
+interface Props {
+ children: ReactElement | string,
+ onClick: () => void,
+ loading?: boolean;
+}
+
+export const LoadingButton = ({ loading, children, ...props }: Props) => {
+ const innerContent = loading ? : children
+
+ return (
+
+ {innerContent}
+
+ )
+}
diff --git a/frontend/src/ui/MultilineSearchField/index.tsx b/frontend/src/ui/MultilineSearchField/index.tsx
new file mode 100644
index 0000000..219aa07
--- /dev/null
+++ b/frontend/src/ui/MultilineSearchField/index.tsx
@@ -0,0 +1,45 @@
+import { ClearButton, Root, TextArea } from "./styles"
+import ClearIcon from "@material-ui/icons/Clear";
+import { ReactElement } from "react";
+
+interface Props {
+ value: string
+ placeholder: string
+ height?: number
+ onChange: any
+ onCancelSearch: any
+ isCancelEnabled: boolean
+}
+
+export const MultilineSearchField = ({
+ value,
+ placeholder,
+ height = 200,
+ onChange,
+ onCancelSearch,
+ isCancelEnabled
+}: Props) => {
+
+ const handleTextChange = (event: React.ChangeEvent) => {
+ onChange(event.target.value)
+ };
+
+ return (
+
+
+ {isCancelEnabled
+ &&
+
+
+
+ }
+
+ )
+}
diff --git a/frontend/src/ui/MultilineSearchField/styles.ts b/frontend/src/ui/MultilineSearchField/styles.ts
new file mode 100644
index 0000000..7a26ff3
--- /dev/null
+++ b/frontend/src/ui/MultilineSearchField/styles.ts
@@ -0,0 +1,53 @@
+import styled from 'styled-components'
+
+
+interface Props {
+ height?: number
+}
+
+export const Root = styled.div`
+ position: relative;
+ margin-top: 20px;
+
+ border-radius: 4px;
+ padding: 12px 40px 12px 16px;
+
+ width: 100%;
+ height: ${({height = 80}) => height}px;
+
+ box-shadow: rgb(0 0 0 / 20%) 0px 2px 6px 0px, rgb(0 0 0 / 14%) 0px 1px 1px 0px, rgb(0 0 0 / 12%) 0px 1px 3px 0px;
+`
+
+
+export const TextArea = styled.textarea`
+ border: none;
+ overflow: auto;
+ outline: none;
+
+ resize: none;
+
+ width: 100%;
+ height: 100%;
+
+ &::placeholder {
+ color: #aaa;
+ }
+`
+
+
+export const ClearButton = styled.div`
+ position: absolute;
+ width: 40px;
+ height: 40px;
+ padding: 8px;
+ right: 0;
+ top: calc(50% - 20px);
+
+ cursor: pointer;
+
+ display: flex;
+
+ flex-direction: column; /* make main axis vertical */
+ justify-content: center; /* center items vertically, in this case */
+ align-items: center;
+`
diff --git a/frontend/src/ui/SearchBar/index.tsx b/frontend/src/ui/SearchBar/index.tsx
new file mode 100644
index 0000000..e7bb1da
--- /dev/null
+++ b/frontend/src/ui/SearchBar/index.tsx
@@ -0,0 +1,56 @@
+
+// import Bar from "material-ui-search-bar";
+// import ClearIcon from "@material-ui/icons/Clear";
+// import grey from '@material-ui/core/colors/grey';
+import { OnSearchItemRemove, OnSearchStateChange } from "../../types/search";
+import { MultilineSearchField } from "../MultilineSearchField";
+
+interface Props {
+ index: number;
+ text: string;
+ onSearchStateChange: OnSearchStateChange;
+ isRemovalEnabled: boolean;
+ onSearchItemRemove: OnSearchItemRemove;
+}
+
+export const SearchBar = ({
+ index,
+ text,
+ onSearchStateChange,
+ isRemovalEnabled,
+ onSearchItemRemove
+}: Props) => {
+ const handleItemRemove = () => isRemovalEnabled && onSearchItemRemove(index)
+
+ const classes: {[key: string]: string} = {}
+
+ if (!isRemovalEnabled) {
+ classes.iconButton ='search_bar__button--hidden'
+ }
+
+ return (
+
+ {/* onSearchStateChange(index, newValue)}
+ searchIcon={ }
+ onRequestSearch={handleItemRemove} // allow only to delete now
+ onCancelSearch={handleItemRemove}
+
+ style={{
+ margin: '20px 0',
+ boxShadow: '0px 2px 6px 0px rgb(0 0 0 / 20%), 0px 1px 1px 0px rgb(0 0 0 / 14%), 0px 1px 3px 0px rgb(0 0 0 / 12%)'
+ }}
+ classes={classes}
+ /> */}
+ onSearchStateChange(index, newValue)}
+ onCancelSearch={handleItemRemove}
+ isCancelEnabled={isRemovalEnabled}
+ />
+
+ )
+}
diff --git a/frontend/src/utils/query_string.ts b/frontend/src/utils/query_string.ts
new file mode 100644
index 0000000..96347ba
--- /dev/null
+++ b/frontend/src/utils/query_string.ts
@@ -0,0 +1,32 @@
+type ParsedURLSearchParams = { [key: string]: string | string[] }
+
+// Flatten URLSearchParams: a key seen once maps to its string, a repeated key to an array of values.
+export const parseURLSearchParams = (params: URLSearchParams): ParsedURLSearchParams => {
+    // Null-prototype map: query keys are user-controlled, and on a `{}` literal a key such as
+    // "toString" would resolve to an inherited member and be mistaken for an earlier value.
+    const result: ParsedURLSearchParams = Object.create(null)
+
+    params.forEach((value, key) => {
+        const seen = result[key]
+        if (seen === undefined) {
+            result[key] = value
+        } else if (Array.isArray(seen)) {
+            seen.push(value)
+        } else {
+            // second occurrence: promote the stored string to a list
+            result[key] = [seen, value]
+        }
+    })
+
+    return result
+}
+
+
+export const ensureArray = (value: string | string[]): string[] => Array.isArray(value) ? value : [value] // wraps a lone string in a list; arrays are returned as-is
+
+
+export const getArrayParam = (params: ParsedURLSearchParams, key: string, defaultValue?: any) => {
+    // Falsy entries (missing key, empty string) fall back to the default, then to [].
+    const found = params[key]
+    const fallback = defaultValue || []
+    return ensureArray(found || fallback)
+}
diff --git a/frontend/src/views/Card.tsx b/frontend/src/views/Card.tsx
index d0ef58c..872a890 100644
--- a/frontend/src/views/Card.tsx
+++ b/frontend/src/views/Card.tsx
@@ -14,6 +14,7 @@ interface Props {
paperCat: string;
paperYear: number;
categories: string[];
+ matchExactCategories: boolean;
years: string[];
similarity_score: number;
setState: (state: any) => void;
@@ -44,12 +45,14 @@ export const Card = (props: Props) => {
const querySemanticallySimilarPapers = async () => {
try {
- const results = await getSemanticallySimilarPapers(
- props.paperId,
- props.years,
- props.categories,
- "KNN",
- props.numPapers);
+ const results = await getSemanticallySimilarPapers({
+ paper_id: props.paperId,
+ years: props.years,
+ categories: props.categories,
+ limit: props.numPapers,
+ matchExactCategories: props.matchExactCategories
+ })
+
props.setState(results.papers)
props.setTotal(results.total)
} catch (err) {
@@ -130,4 +133,4 @@ export const Card = (props: Props) => {
);
- };
\ No newline at end of file
+ };
diff --git a/frontend/src/views/Home.tsx b/frontend/src/views/Home.tsx
index 146afc3..c718676 100644
--- a/frontend/src/views/Home.tsx
+++ b/frontend/src/views/Home.tsx
@@ -1,236 +1,227 @@
-import { useState, useEffect } from 'react';
-import { getPapers, getSemanticallySimilarPapersbyText } from '../api';
+import { useEffect } from 'react';
+import { useImmer } from "use-immer";
+import { getPapers, getSemanticallySimilarPapersbyText, getSuggestedCategories } from '../api';
import { Card } from "./Card"
-import SearchBar from "material-ui-search-bar";
-
-
-import OutlinedInput from '@mui/material/OutlinedInput';
-import InputLabel from '@mui/material/InputLabel';
-import MenuItem from '@mui/material/MenuItem';
-import FormControl from '@mui/material/FormControl';
-import ListItemText from '@mui/material/ListItemText';
-import Select, { SelectChangeEvent } from '@mui/material/Select';
-import Checkbox from '@mui/material/Checkbox';
-import Tooltip from '@mui/material/Tooltip';
-
-/* eslint-disable jsx-a11y/anchor-is-valid */
-/* eslint-disable @typescript-eslint/no-unused-vars */
-
-interface Props {
- papers: any[];
- setPapers: (state: any) => void;
- categories: string[];
- setCategories: (state: any) => void;
- years: string[];
- setYears: (state: any) => void;
- searchState: string;
- setSearchState: (state: any) => void;
- total: number;
- setTotal: (state: any) => void;
-}
-
-export const Home = (props: Props) => {
- const [error, setError] = useState('');
- const [skip, setSkip] = useState(0);
- const [limit, setLimit] = useState(15);
-
- const ITEM_HEIGHT = 48;
- const ITEM_PADDING_TOP = 8;
- const MenuProps = {
- PaperProps: {
- style: {
- maxHeight: ITEM_HEIGHT * 4.5 + ITEM_PADDING_TOP,
- width: 150,
- },
- },
- };
- const yearOptions = [
- '2022',
- '2021',
- '2020',
- '2019',
- '2018',
- '2017',
- '2016',
- '2015',
- '2014',
- '2013',
- '2012',
- '2011'
- ];
-
- const categoryOptions = [
- 'cs.LG',
- 'math-ph',
- 'quant-ph',
- 'cond-mat.mes-hall',
- 'hep-ph',
- 'hep-th',
- 'gr-qc',
- 'cond-mat.mtrl-sci',
- 'cond-mat.str-el',
- 'cond-mat.stat-mech',
- 'astro-ph.CO',
- 'math.MP',
- 'astro-ph.HE',
- 'physics.optics',
- 'astro-ph.GA'
- ]
-
- const handleSearchChange = async (newValue: string) => {
- props.setSearchState(newValue);
+import {
+ Tooltip,
+ SelectChangeEvent,
+} from '@mui/material';
+
+
+import { SearchStates } from '../types/search';
+import { Search } from '../components/Search';
+import { AddItemButton } from '../ui/AddItemButton';
+
+import { useSearchParams } from 'react-router-dom';
+import { getArrayParam, parseURLSearchParams } from '../utils/query_string';
+import { useDebounce } from '../hooks/useDebounce';
+import { SuggestedCategories } from '../components/SuggestedCategories';
+import { LoadingButton } from '../ui/LoadingButton';
+import { SearchFilters } from '../components/SearchFilters';
+import CopyToClipboardButton from '../ui/CopyToClipboard';
+import { CATEGORY_HUMAN_NAMES } from '../constants/search_filter';
+
+export const Home = () => {
+ const [urlParams, setUrlParams] = useSearchParams();
+
+ const parsed_params = parseURLSearchParams(urlParams)
+
+ const [papers, setPapers] = useImmer([]);
+ const [isLoadingPapers, setIsLoadingPapers] = useImmer(false);
+ const [categories, setCategories] = useImmer(getArrayParam(parsed_params, 'categories', []));
+ const [suggestedCategories, setSuggestedCategories] = useImmer([]);
+ const [matchExactCategories, setMatchExactCategories] = useImmer(false)
+ const [years, setYears] = useImmer(getArrayParam(parsed_params, 'years', []));
+ const [searchStates, setSearchStates] = useImmer(getArrayParam(parsed_params, 'searchStates', ['']));
+ const [total, setTotal] = useImmer(0);
+ const [_error, setError] = useImmer('');
+ const [skip, setSkip] = useImmer(0);
+ const [limit, _setLimit] = useImmer(15);
+
+ const changeSuggestedCategories = async () => {
+ const newSuggestedCategories = await getSuggestedCategories(searchStates)
+
+ setSuggestedCategories(newSuggestedCategories)
}
+ const changeSuggestedCategoriesDebounced = useDebounce(changeSuggestedCategories, 500)
- const handleYearSelection = (event: SelectChangeEvent) => {
+  const queryPapers = async () => {
+    setIsLoadingPapers(true)
+    try {
+      // `searchStates` is an array and therefore always truthy, so check for actual text.
+      // Blank search boxes are dropped instead of being sent to the backend.
+      const searchItems = searchStates.filter(text => text.trim())
+      // Listing mode (no query text) advances the paging offset on every call.
+      if (!searchItems.length) setSkip(skip + limit);
+      const result = searchItems.length
+        ? await getSemanticallySimilarPapersbyText({ searchItems, years, categories, matchExactCategories })
+        : await getPapers(limit, skip, years, categories);
+      setPapers(result.papers)
+      setTotal(result.total)
+    } catch (err) {
+      setError(String(err));
+    } finally {
+      setIsLoadingPapers(false)
+    }
+  };
+
+ // Execute this one when the component loads up
+ useEffect(() => {
+ queryPapers();
+ // eslint-disable-next-line react-hooks/exhaustive-deps
+ }, []);
+
+ useEffect(() => {
+ setUrlParams({
+ years,
+ categories,
+ searchStates
+ })
+ }, [years, categories, searchStates, setUrlParams])
+
+ const handleYearSelection = (event: SelectChangeEvent) => {
const {
target: { value },
} = event;
- props.setYears(
+ setYears(
// On autofill we get a stringified value.
typeof value === 'string' ? value.split(',') : value,
);
- setSkip(0);
- console.log(props.years)
+ setSkip(0)
};
- const handleCatSelection = (event: SelectChangeEvent) => {
+ const handleCategorySelection = (event: SelectChangeEvent) => {
const {
target: { value },
} = event;
- props.setCategories(
+ setCategories(
// On autofill we get a stringified value.
typeof value === 'string' ? value.split(',') : value,
);
- setSkip(0);
+ setSkip(0)
};
- const queryPapers = async () => {
- try {
- if ( props.searchState ) {
- const result = await getSemanticallySimilarPapersbyText(props.searchState, props.years, props.categories)
- props.setPapers(result.papers)
- props.setTotal(result.total)
- } else {
- setSkip(skip + limit);
- const result = await getPapers(limit, skip, props.years, props.categories);
- props.setPapers(result.papers)
- props.setTotal(result.total)
- }
- } catch (err) {
- setError(String(err));
- }
- };
+ const handleSearchChange = (index: number, newText: string) => {
+ setSearchStates(searchStates => {
+ searchStates[index] = newText
+ })
- // Execute this one when the component loads up
- useEffect(() => {
- props.setPapers([]);
- props.setCategories([]);
- props.setYears([]);
- queryPapers();
- }, []);
+ changeSuggestedCategoriesDebounced()
+ }
+
+ const handleSearchItemAdd = () => {
+ setSearchStates(searchStates => {
+ searchStates.push('')
+ })
+ }
+
+ const handleSearchItemRemove = (index: number) => {
+ setSearchStates(searchStates => {
+ searchStates.splice(index, 1)
+ })
+ }
+
+ const applySuggestedCategories = () => {
+ const mergedCategories = new Set([...categories, ...suggestedCategories])
+
+ setCategories(Array.from(mergedCategories))
+ }
+
+ const handleMatchExactCategoriesChange = () => {
+ setMatchExactCategories((el) => !el)
+ }
return (
<>
-
-
-
arXiv Paper Search
-
- This demo uses the built in Vector Search capabilities of Redis Enterprise
- to show how unstructured data, such as paper abstracts (text), can be used to create a powerful
- search engine.
-
-
- Enter a search query below to discover scholarly papers hosted by arXiv (Cornell University).
-
-
- handleSearchChange(newValue)}
- onRequestSearch={() => queryPapers()}
- style={{
- margin: '20px 0',
- }}
- />
-
-
-
- Year
- }
- renderValue={(selected) => selected.join(', ')}
- MenuProps={MenuProps}
- >
- {yearOptions.map((year) => (
-
- -1} />
-
-
- ))}
-
-
-
- Category
- }
- renderValue={(selected) => selected.join(', ')}
- MenuProps={MenuProps}
- >
- {categoryOptions.map((cat) => (
-
- -1} />
-
-
- ))}
-
-
-
-
-
-
-
-
-
- {props.total} searchable arXiv papers
-
-
-
-
- {props.papers && (
-
- {props.papers.map((paper) => (
-
+
+
arXiv Paper Search
+
+ This demo uses the built in Vector Search capabilities of Redis Enterprise
+ to show how unstructured data, such as paper abstracts (text), can be used to create a powerful
+ search engine.
+
+
+ Enter a search query below to discover scholarly papers hosted by arXiv (Cornell University).
+
+
+
+
+
+
+
+
+
+
+ Search!
+
+
+
`${slug} (${CATEGORY_HUMAN_NAMES[slug as keyof typeof CATEGORY_HUMAN_NAMES]})`)}
+ onClick={applySuggestedCategories}
+ />
+
+
+
+
+
+
+
+
+
+ {total} searchable arXiv papers
+
+
+
+
+
+
+
+
+
+ {papers && (
+
+ {papers.map((paper) => (
+
))}
-
+
)}
>
);
-};
\ No newline at end of file
+};
diff --git a/frontend/yarn.lock b/frontend/yarn.lock
index 3bc3967..0ab2707 100644
--- a/frontend/yarn.lock
+++ b/frontend/yarn.lock
@@ -26,6 +26,13 @@
dependencies:
"@babel/highlight" "^7.16.7"
+"@babel/code-frame@^7.18.6":
+ version "7.18.6"
+ resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.18.6.tgz#3b25d38c89600baa2dcc219edfa88a74eb2c427a"
+ integrity sha512-TDCmlK5eOvH+eH7cdAFlNXeVJqWIQ7gW9tY1GJIpUtFb6CmjVyq2VM3u71bOyR8CRihcCgMUYoDNyLXao3+70Q==
+ dependencies:
+ "@babel/highlight" "^7.18.6"
+
"@babel/compat-data@^7.13.11", "@babel/compat-data@^7.17.10":
version "7.18.5"
resolved "https://registry.yarnpkg.com/@babel/compat-data/-/compat-data-7.18.5.tgz#acac0c839e317038c73137fbb6ef71a1d6238471"
@@ -70,6 +77,22 @@
"@jridgewell/gen-mapping" "^0.3.0"
jsesc "^2.5.1"
+"@babel/generator@^7.20.1":
+ version "7.20.1"
+ resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.20.1.tgz#ef32ecd426222624cbd94871a7024639cf61a9fa"
+ integrity sha512-u1dMdBUmA7Z0rBB97xh8pIhviK7oItYOkjbsCxTWMknyvbQRBwX7/gn4JXurRdirWMFh+ZtYARqkA6ydogVZpg==
+ dependencies:
+ "@babel/types" "^7.20.0"
+ "@jridgewell/gen-mapping" "^0.3.2"
+ jsesc "^2.5.1"
+
+"@babel/helper-annotate-as-pure@^7.16.0":
+ version "7.18.6"
+ resolved "https://registry.yarnpkg.com/@babel/helper-annotate-as-pure/-/helper-annotate-as-pure-7.18.6.tgz#eaa49f6f80d5a33f9a5dd2276e6d6e451be0a6bb"
+ integrity sha512-duORpUiYrEpzKIop6iNbjnwKLAKnJ47csTyRACyEmWj0QdUrm5aqNJGHSSEQSUAvNW0ojX0dOmK9dZduvkfeXA==
+ dependencies:
+ "@babel/types" "^7.18.6"
+
"@babel/helper-annotate-as-pure@^7.16.7":
version "7.16.7"
resolved "https://registry.yarnpkg.com/@babel/helper-annotate-as-pure/-/helper-annotate-as-pure-7.16.7.tgz#bb2339a7534a9c128e3102024c60760a3a7f3862"
@@ -135,6 +158,11 @@
resolved "https://registry.yarnpkg.com/@babel/helper-environment-visitor/-/helper-environment-visitor-7.18.2.tgz#8a6d2dedb53f6bf248e31b4baf38739ee4a637bd"
integrity sha512-14GQKWkX9oJzPiQQ7/J36FTXcD4kSp8egKjO9nINlSKiHITRA9q/R74qu8S9xlc/b/yjsJItQUeeh3xnGN0voQ==
+"@babel/helper-environment-visitor@^7.18.9":
+ version "7.18.9"
+ resolved "https://registry.yarnpkg.com/@babel/helper-environment-visitor/-/helper-environment-visitor-7.18.9.tgz#0c0cee9b35d2ca190478756865bb3528422f51be"
+ integrity sha512-3r/aACDJ3fhQ/EVgFy0hpj8oHyHpQc+LPtJoY9SzTThAsStm4Ptegq92vqKoE3vD706ZVFWITnMnxucw+S9Ipg==
+
"@babel/helper-explode-assignable-expression@^7.16.7":
version "7.16.7"
resolved "https://registry.yarnpkg.com/@babel/helper-explode-assignable-expression/-/helper-explode-assignable-expression-7.16.7.tgz#12a6d8522fdd834f194e868af6354e8650242b7a"
@@ -150,6 +178,14 @@
"@babel/template" "^7.16.7"
"@babel/types" "^7.17.0"
+"@babel/helper-function-name@^7.19.0":
+ version "7.19.0"
+ resolved "https://registry.yarnpkg.com/@babel/helper-function-name/-/helper-function-name-7.19.0.tgz#941574ed5390682e872e52d3f38ce9d1bef4648c"
+ integrity sha512-WAwHBINyrpqywkUH0nTnNgI5ina5TFn85HKS0pbPDfxFfhyR/aNQEn4hGi1P1JyT//I0t4OgXUlofzWILRvS5w==
+ dependencies:
+ "@babel/template" "^7.18.10"
+ "@babel/types" "^7.19.0"
+
"@babel/helper-hoist-variables@^7.16.7":
version "7.16.7"
resolved "https://registry.yarnpkg.com/@babel/helper-hoist-variables/-/helper-hoist-variables-7.16.7.tgz#86bcb19a77a509c7b77d0e22323ef588fa58c246"
@@ -157,6 +193,13 @@
dependencies:
"@babel/types" "^7.16.7"
+"@babel/helper-hoist-variables@^7.18.6":
+ version "7.18.6"
+ resolved "https://registry.yarnpkg.com/@babel/helper-hoist-variables/-/helper-hoist-variables-7.18.6.tgz#d4d2c8fb4baeaa5c68b99cc8245c56554f926678"
+ integrity sha512-UlJQPkFqFULIcyW5sbzgbkxn2FKRgwWiRexcuaR8RNJRy8+LLveqPjwZV/bwrLZCN0eUHD/x8D0heK1ozuoo6Q==
+ dependencies:
+ "@babel/types" "^7.18.6"
+
"@babel/helper-member-expression-to-functions@^7.17.7":
version "7.17.7"
resolved "https://registry.yarnpkg.com/@babel/helper-member-expression-to-functions/-/helper-member-expression-to-functions-7.17.7.tgz#a34013b57d8542a8c4ff8ba3f747c02452a4d8c4"
@@ -164,6 +207,13 @@
dependencies:
"@babel/types" "^7.17.0"
+"@babel/helper-module-imports@^7.0.0", "@babel/helper-module-imports@^7.16.0":
+ version "7.18.6"
+ resolved "https://registry.yarnpkg.com/@babel/helper-module-imports/-/helper-module-imports-7.18.6.tgz#1e3ebdbbd08aad1437b428c50204db13c5a3ca6e"
+ integrity sha512-0NFvs3VkuSYbFi1x2Vd6tKrywq+z/cLeYC/RJNFrIX/30Bf5aiGYbtvGXolEktzJH8o5E5KJ3tT+nkxuuZFVlA==
+ dependencies:
+ "@babel/types" "^7.18.6"
+
"@babel/helper-module-imports@^7.10.4", "@babel/helper-module-imports@^7.12.13", "@babel/helper-module-imports@^7.16.7":
version "7.16.7"
resolved "https://registry.yarnpkg.com/@babel/helper-module-imports/-/helper-module-imports-7.16.7.tgz#25612a8091a999704461c8a222d0efec5d091437"
@@ -238,11 +288,28 @@
dependencies:
"@babel/types" "^7.16.7"
+"@babel/helper-split-export-declaration@^7.18.6":
+ version "7.18.6"
+ resolved "https://registry.yarnpkg.com/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.18.6.tgz#7367949bc75b20c6d5a5d4a97bba2824ae8ef075"
+ integrity sha512-bde1etTx6ZyTmobl9LLMMQsaizFVZrquTEHOqKeQESMKo4PlObf+8+JA25ZsIpZhT/WEd39+vOdLXAFG/nELpA==
+ dependencies:
+ "@babel/types" "^7.18.6"
+
+"@babel/helper-string-parser@^7.19.4":
+ version "7.19.4"
+ resolved "https://registry.yarnpkg.com/@babel/helper-string-parser/-/helper-string-parser-7.19.4.tgz#38d3acb654b4701a9b77fb0615a96f775c3a9e63"
+ integrity sha512-nHtDoQcuqFmwYNYPz3Rah5ph2p8PFeFCsZk9A/48dPc/rGocJ5J3hAAZ7pb76VWX3fZKu+uEr/FhH5jLx7umrw==
+
"@babel/helper-validator-identifier@^7.16.7":
version "7.16.7"
resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.16.7.tgz#e8c602438c4a8195751243da9031d1607d247cad"
integrity sha512-hsEnFemeiW4D08A5gUAZxLBTXpZ39P+a+DGDsHw1yxqyQ/jzFEnxf5uTEGp+3bzAbNOxU1paTgYS4ECU/IgfDw==
+"@babel/helper-validator-identifier@^7.18.6", "@babel/helper-validator-identifier@^7.19.1":
+ version "7.19.1"
+ resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.19.1.tgz#7eea834cf32901ffdc1a7ee555e2f9c27e249ca2"
+ integrity sha512-awrNfaMtnHUr653GgGEs++LlAvW6w+DcPrOliSMXWCKo597CwL5Acf/wWdNkf/tfEQE3mjkeD1YOVZOUV/od1w==
+
"@babel/helper-validator-option@^7.16.7":
version "7.16.7"
resolved "https://registry.yarnpkg.com/@babel/helper-validator-option/-/helper-validator-option-7.16.7.tgz#b203ce62ce5fe153899b617c08957de860de4d23"
@@ -276,11 +343,25 @@
chalk "^2.0.0"
js-tokens "^4.0.0"
+"@babel/highlight@^7.18.6":
+ version "7.18.6"
+ resolved "https://registry.yarnpkg.com/@babel/highlight/-/highlight-7.18.6.tgz#81158601e93e2563795adcbfbdf5d64be3f2ecdf"
+ integrity sha512-u7stbOuYjaPezCuLj29hNW1v64M2Md2qupEKP1fHc7WdOA3DgLh37suiSrZYY7haUB7iBeQZ9P1uiRF359do3g==
+ dependencies:
+ "@babel/helper-validator-identifier" "^7.18.6"
+ chalk "^2.0.0"
+ js-tokens "^4.0.0"
+
"@babel/parser@^7.1.0", "@babel/parser@^7.14.7", "@babel/parser@^7.16.7", "@babel/parser@^7.18.5":
version "7.18.5"
resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.18.5.tgz#337062363436a893a2d22faa60be5bb37091c83c"
integrity sha512-YZWVaglMiplo7v8f1oMQ5ZPQr0vn7HPeZXxXWsxXJRjGVrzUFn9OxFQl1sb5wzfootjA/yChhW84BV+383FSOw==
+"@babel/parser@^7.18.10", "@babel/parser@^7.20.1":
+ version "7.20.1"
+ resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.20.1.tgz#3e045a92f7b4623cafc2425eddcb8cf2e54f9cc5"
+ integrity sha512-hp0AYxaZJhxULfM1zyp7Wgr+pSUKBcP3M+PHnSzWGdXOzg/kHWIgiUWARvubhUKGOEw3xqY4x+lyZ9ytBVcELw==
+
"@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression@^7.17.12":
version "7.17.12"
resolved "https://registry.yarnpkg.com/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression/-/plugin-bugfix-safari-id-destructuring-collision-in-function-expression-7.17.12.tgz#1dca338caaefca368639c9ffb095afbd4d420b1e"
@@ -1042,6 +1123,15 @@
"@babel/parser" "^7.16.7"
"@babel/types" "^7.16.7"
+"@babel/template@^7.18.10":
+ version "7.18.10"
+ resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.18.10.tgz#6f9134835970d1dbf0835c0d100c9f38de0c5e71"
+ integrity sha512-TI+rCtooWHr3QJ27kJxfjutghu44DLnasDMwpDqCXVTal9RLp3RSYNh4NdBrRP2cQAoG9A8juOQl6P6oZG4JxA==
+ dependencies:
+ "@babel/code-frame" "^7.18.6"
+ "@babel/parser" "^7.18.10"
+ "@babel/types" "^7.18.10"
+
"@babel/traverse@^7.13.0", "@babel/traverse@^7.16.8", "@babel/traverse@^7.18.0", "@babel/traverse@^7.18.2", "@babel/traverse@^7.18.5", "@babel/traverse@^7.7.2":
version "7.18.5"
resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.18.5.tgz#94a8195ad9642801837988ab77f36e992d9a20cd"
@@ -1058,6 +1148,22 @@
debug "^4.1.0"
globals "^11.1.0"
+"@babel/traverse@^7.4.5":
+ version "7.20.1"
+ resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.20.1.tgz#9b15ccbf882f6d107eeeecf263fbcdd208777ec8"
+ integrity sha512-d3tN8fkVJwFLkHkBN479SOsw4DMZnz8cdbL/gvuDuzy3TS6Nfw80HuQqhw1pITbIruHyh7d1fMA47kWzmcUEGA==
+ dependencies:
+ "@babel/code-frame" "^7.18.6"
+ "@babel/generator" "^7.20.1"
+ "@babel/helper-environment-visitor" "^7.18.9"
+ "@babel/helper-function-name" "^7.19.0"
+ "@babel/helper-hoist-variables" "^7.18.6"
+ "@babel/helper-split-export-declaration" "^7.18.6"
+ "@babel/parser" "^7.20.1"
+ "@babel/types" "^7.20.0"
+ debug "^4.1.0"
+ globals "^11.1.0"
+
"@babel/types@^7.0.0", "@babel/types@^7.12.6", "@babel/types@^7.16.0", "@babel/types@^7.16.7", "@babel/types@^7.16.8", "@babel/types@^7.17.0", "@babel/types@^7.17.12", "@babel/types@^7.18.0", "@babel/types@^7.18.2", "@babel/types@^7.18.4", "@babel/types@^7.3.0", "@babel/types@^7.3.3", "@babel/types@^7.4.4":
version "7.18.4"
resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.18.4.tgz#27eae9b9fd18e9dccc3f9d6ad051336f307be354"
@@ -1066,6 +1172,15 @@
"@babel/helper-validator-identifier" "^7.16.7"
to-fast-properties "^2.0.0"
+"@babel/types@^7.18.10", "@babel/types@^7.18.6", "@babel/types@^7.19.0", "@babel/types@^7.20.0":
+ version "7.20.0"
+ resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.20.0.tgz#52c94cf8a7e24e89d2a194c25c35b17a64871479"
+ integrity sha512-Jlgt3H0TajCW164wkTOTzHkZb075tMQMULzrLUoUeKmO7eFL96GgDxf7/Axhc5CAuKE3KFyVW1p6ysKsi2oXAg==
+ dependencies:
+ "@babel/helper-string-parser" "^7.19.4"
+ "@babel/helper-validator-identifier" "^7.19.1"
+ to-fast-properties "^2.0.0"
+
"@bcoe/v8-coverage@^0.2.3":
version "0.2.3"
resolved "https://registry.yarnpkg.com/@bcoe/v8-coverage/-/v8-coverage-0.2.3.tgz#75a2e8b51cb758a7553d6804a5932d7aace75c39"
@@ -1202,6 +1317,13 @@
resolved "https://registry.yarnpkg.com/@emotion/hash/-/hash-0.8.0.tgz#bbbff68978fefdbe68ccb533bc8cbe1d1afb5413"
integrity sha512-kBJtf7PH6aWwZ6fka3zQ0p6SBYzx4fl1LoZXE2RrnYST9Xljm7WfKJrU4g/Xr3Beg72MLrp1AWNUmuYJTL7Cow==
+"@emotion/is-prop-valid@^1.1.0":
+ version "1.2.0"
+ resolved "https://registry.yarnpkg.com/@emotion/is-prop-valid/-/is-prop-valid-1.2.0.tgz#7f2d35c97891669f7e276eb71c83376a5dc44c83"
+ integrity sha512-3aDpDprjM0AwaxGE09bOPkNxHpBd+kA6jty3RnaEXdweX1DF1U3VQpPYb0g1IStAuK7SVQ1cy+bNBBKp4W3Fjg==
+ dependencies:
+ "@emotion/memoize" "^0.8.0"
+
"@emotion/is-prop-valid@^1.1.2", "@emotion/is-prop-valid@^1.1.3":
version "1.1.3"
resolved "https://registry.yarnpkg.com/@emotion/is-prop-valid/-/is-prop-valid-1.1.3.tgz#f0907a416368cf8df9e410117068e20fe87c0a3a"
@@ -1214,6 +1336,11 @@
resolved "https://registry.yarnpkg.com/@emotion/memoize/-/memoize-0.7.5.tgz#2c40f81449a4e554e9fc6396910ed4843ec2be50"
integrity sha512-igX9a37DR2ZPGYtV6suZ6whr8pTFtyHL3K/oLUotxpSVO2ASaprmAe2Dkq7tBo7CRY7MMDrAa9nuQP9/YG8FxQ==
+"@emotion/memoize@^0.8.0":
+ version "0.8.0"
+ resolved "https://registry.yarnpkg.com/@emotion/memoize/-/memoize-0.8.0.tgz#f580f9beb67176fa57aae70b08ed510e1b18980f"
+ integrity sha512-G/YwXTkv7Den9mXDO7AhLWkE3q+I92B+VqAE+dYG4NGPaHZGvt3G8Q0p9vmE+sq7rTGphUbAvmQ9YpbfMQGGlA==
+
"@emotion/react@^11.4.1", "@emotion/react@^11.9.3":
version "11.9.3"
resolved "https://registry.yarnpkg.com/@emotion/react/-/react-11.9.3.tgz#f4f4f34444f6654a2e550f5dab4f2d360c101df9"
@@ -1254,7 +1381,12 @@
"@emotion/serialize" "^1.0.4"
"@emotion/utils" "^1.1.0"
-"@emotion/unitless@^0.7.5":
+"@emotion/stylis@^0.8.4":
+ version "0.8.5"
+ resolved "https://registry.yarnpkg.com/@emotion/stylis/-/stylis-0.8.5.tgz#deacb389bd6ee77d1e7fcaccce9e16c5c7e78e04"
+ integrity sha512-h6KtPihKFn3T9fuIrwvXXUOwlx3rfUvfZIcP5a6rh8Y7zjE3O06hT5Ss4S/YI1AYhuZ1kjaE/5EaOOI2NqSylQ==
+
+"@emotion/unitless@^0.7.4", "@emotion/unitless@^0.7.5":
version "0.7.5"
resolved "https://registry.yarnpkg.com/@emotion/unitless/-/unitless-0.7.5.tgz#77211291c1900a700b8a78cfafda3160d76949ed"
integrity sha512-OWORNpfjMsSSUBVrRBVGECkhWcULOAJz9ZW8uK9qgxD+87M7jHRcvh/A96XXNhXTLmKcoYSQtBEX7lHMO7YRwg==
@@ -1541,6 +1673,15 @@
"@jridgewell/sourcemap-codec" "^1.4.10"
"@jridgewell/trace-mapping" "^0.3.9"
+"@jridgewell/gen-mapping@^0.3.2":
+ version "0.3.2"
+ resolved "https://registry.yarnpkg.com/@jridgewell/gen-mapping/-/gen-mapping-0.3.2.tgz#c1aedc61e853f2bb9f5dfe6d4442d3b565b253b9"
+ integrity sha512-mh65xKQAzI6iBcFzwv28KVWSmCkdRBWoOh+bYQGW3+6OZvbbN3TqMGo5hqYxQniRcH9F2VZIoJCm4pa3BPDK/A==
+ dependencies:
+ "@jridgewell/set-array" "^1.0.1"
+ "@jridgewell/sourcemap-codec" "^1.4.10"
+ "@jridgewell/trace-mapping" "^0.3.9"
+
"@jridgewell/resolve-uri@^3.0.3":
version "3.0.7"
resolved "https://registry.yarnpkg.com/@jridgewell/resolve-uri/-/resolve-uri-3.0.7.tgz#30cd49820a962aff48c8fffc5cd760151fca61fe"
@@ -1551,6 +1692,11 @@
resolved "https://registry.yarnpkg.com/@jridgewell/set-array/-/set-array-1.1.1.tgz#36a6acc93987adcf0ba50c66908bd0b70de8afea"
integrity sha512-Ct5MqZkLGEXTVmQYbGtx9SVqD2fqwvdubdps5D3djjAkgkKwT918VNOz65pEHFaYTeWcukmJmH5SwsA9Tn2ObQ==
+"@jridgewell/set-array@^1.0.1":
+ version "1.1.2"
+ resolved "https://registry.yarnpkg.com/@jridgewell/set-array/-/set-array-1.1.2.tgz#7c6cf998d6d20b914c0a55a91ae928ff25965e72"
+ integrity sha512-xnkseuNADM0gt2bs+BvhO0p78Mk762YnZdsuzFV018NoG1Sj1SCQvpSqa7XUaTam5vAGasABV9qXASMKnFMwMw==
+
"@jridgewell/source-map@^0.3.2":
version "0.3.2"
resolved "https://registry.yarnpkg.com/@jridgewell/source-map/-/source-map-0.3.2.tgz#f45351aaed4527a298512ec72f81040c998580fb"
@@ -2161,6 +2307,14 @@
dependencies:
"@types/node" "*"
+"@types/hoist-non-react-statics@*":
+ version "3.3.1"
+ resolved "https://registry.yarnpkg.com/@types/hoist-non-react-statics/-/hoist-non-react-statics-3.3.1.tgz#1124aafe5118cb591977aeb1ceaaed1070eb039f"
+ integrity sha512-iMIqiko6ooLrTh1joXodJK5X9xeEALT1kM5G3ZLhD3hszxBdIEd5C75U834D9mLcINgD4OyZf5uQXjkuYydWvA==
+ dependencies:
+ "@types/react" "*"
+ hoist-non-react-statics "^3.3.0"
+
"@types/html-minifier-terser@^6.0.0":
version "6.1.0"
resolved "https://registry.yarnpkg.com/@types/html-minifier-terser/-/html-minifier-terser-6.1.0.tgz#4fc33a00c1d0c16987b1a20cf92d20614c55ac35"
@@ -2337,6 +2491,15 @@
resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-2.0.1.tgz#20f18294f797f2209b5f65c8e3b5c8e8261d127c"
integrity sha512-Hl219/BT5fLAaz6NDkSuhzasy49dwQS/DSdu4MdggFB8zcXv7vflBI3xp7FEmkmdDkBUI2bPUNeMttp2knYdxw==
+"@types/styled-components@^5.1.26":
+ version "5.1.26"
+ resolved "https://registry.yarnpkg.com/@types/styled-components/-/styled-components-5.1.26.tgz#5627e6812ee96d755028a98dae61d28e57c233af"
+ integrity sha512-KuKJ9Z6xb93uJiIyxo/+ksS7yLjS1KzG6iv5i78dhVg/X3u5t1H7juRWqVmodIdz6wGVaIApo1u01kmFRdJHVw==
+ dependencies:
+ "@types/hoist-non-react-statics" "*"
+ "@types/react" "*"
+ csstype "^3.0.2"
+
"@types/testing-library__jest-dom@^5.9.1":
version "5.14.5"
resolved "https://registry.yarnpkg.com/@types/testing-library__jest-dom/-/testing-library__jest-dom-5.14.5.tgz#d113709c90b3c75fdb127ec338dad7d5f86c974f"
@@ -3035,6 +3198,22 @@ babel-plugin-polyfill-regenerator@^0.3.0:
dependencies:
"@babel/helper-define-polyfill-provider" "^0.3.1"
+"babel-plugin-styled-components@>= 1.12.0":
+ version "2.0.7"
+ resolved "https://registry.yarnpkg.com/babel-plugin-styled-components/-/babel-plugin-styled-components-2.0.7.tgz#c81ef34b713f9da2b7d3f5550df0d1e19e798086"
+ integrity sha512-i7YhvPgVqRKfoQ66toiZ06jPNA3p6ierpfUuEWxNF+fV27Uv5gxBkf8KZLHUCc1nFA9j6+80pYoIpqCeyW3/bA==
+ dependencies:
+ "@babel/helper-annotate-as-pure" "^7.16.0"
+ "@babel/helper-module-imports" "^7.16.0"
+ babel-plugin-syntax-jsx "^6.18.0"
+ lodash "^4.17.11"
+ picomatch "^2.3.0"
+
+babel-plugin-syntax-jsx@^6.18.0:
+ version "6.18.0"
+ resolved "https://registry.yarnpkg.com/babel-plugin-syntax-jsx/-/babel-plugin-syntax-jsx-6.18.0.tgz#0af32a9a6e13ca7a3fd5069e62d7b0f58d0d8946"
+ integrity sha512-qrPaCSo9c8RHNRHIotaufGbuOBN8rtdC4QrrFFc43vyWCCz7Kl7GL1PGaXtMGQZUXrkCjNEgxDfmAuAabr/rlw==
+
babel-plugin-transform-react-remove-prop-types@^0.4.24:
version "0.4.24"
resolved "https://registry.yarnpkg.com/babel-plugin-transform-react-remove-prop-types/-/babel-plugin-transform-react-remove-prop-types-0.4.24.tgz#f2edaf9b4c6a5fbe5c1d678bfb531078c1555f3a"
@@ -3293,6 +3472,11 @@ camelcase@^6.2.0, camelcase@^6.2.1:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==
+camelize@^1.0.0:
+ version "1.0.1"
+ resolved "https://registry.yarnpkg.com/camelize/-/camelize-1.0.1.tgz#89b7e16884056331a35d6b5ad064332c91daa6c3"
+ integrity sha512-dU+Tx2fsypxTgtLoE36npi3UqcjSSMNYfkqgmoEhtZrraP5VWq0K7FkWVTYa8eMPtnU/G2txVsfdCJTn9uzpuQ==
+
caniuse-api@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/caniuse-api/-/caniuse-api-3.0.0.tgz#5e4d90e2274961d46291997df599e3ed008ee4c0"
@@ -3654,6 +3838,11 @@ css-blank-pseudo@^3.0.3:
dependencies:
postcss-selector-parser "^6.0.9"
+css-color-keywords@^1.0.0:
+ version "1.0.0"
+ resolved "https://registry.yarnpkg.com/css-color-keywords/-/css-color-keywords-1.0.0.tgz#fea2616dc676b2962686b3af8dbdbe180b244e05"
+ integrity sha512-FyyrDHZKEjXDpNJYvVsV960FiqQyXc/LlYmsxl2BcdMb2WPx0OGRVgTg55rPSyLSNMqP52R9r8geSp7apN3Ofg==
+
css-declaration-sorter@^6.3.0:
version "6.3.0"
resolved "https://registry.yarnpkg.com/css-declaration-sorter/-/css-declaration-sorter-6.3.0.tgz#72ebd995c8f4532ff0036631f7365cce9759df14"
@@ -3736,6 +3925,15 @@ css-select@^4.1.3:
domutils "^2.8.0"
nth-check "^2.0.1"
+css-to-react-native@^3.0.0:
+ version "3.0.0"
+ resolved "https://registry.yarnpkg.com/css-to-react-native/-/css-to-react-native-3.0.0.tgz#62dbe678072a824a689bcfee011fc96e02a7d756"
+ integrity sha512-Ro1yETZA813eoyUp2GDBhG2j+YggidUmzO1/v9eYBKR2EHVEniE2MI/NqpTQ954BMpTPZFsGNPm46qFB9dpaPQ==
+ dependencies:
+ camelize "^1.0.0"
+ css-color-keywords "^1.0.0"
+ postcss-value-parser "^4.0.2"
+
css-tree@1.0.0-alpha.37:
version "1.0.0-alpha.37"
resolved "https://registry.yarnpkg.com/css-tree/-/css-tree-1.0.0-alpha.37.tgz#98bebd62c4c1d9f960ec340cf9f7522e30709a22"
@@ -5169,7 +5367,7 @@ hoist-non-react-statics@^2.3.1:
resolved "https://registry.yarnpkg.com/hoist-non-react-statics/-/hoist-non-react-statics-2.5.5.tgz#c5903cf409c0dfd908f388e619d86b9c1174cb47"
integrity sha512-rqcy4pJo55FTTLWt+bU8ukscqHeE/e9KWvsOW2b/a3afxQZhwkQdT1rPPCJ0rYXdj4vNcasY8zHTH+jF/qStxw==
-hoist-non-react-statics@^3.3.1, hoist-non-react-statics@^3.3.2:
+hoist-non-react-statics@^3.0.0, hoist-non-react-statics@^3.3.0, hoist-non-react-statics@^3.3.1, hoist-non-react-statics@^3.3.2:
version "3.3.2"
resolved "https://registry.yarnpkg.com/hoist-non-react-statics/-/hoist-non-react-statics-3.3.2.tgz#ece0acaf71d62c2969c2ec59feff42a4b1a85b45"
integrity sha512-/gGivxi8JPKWNm/W0jSmzcMPpfpPLc3dY/6GxhX2hQ9iGj3aDfklV4ET7NjKpSinLpJ5vafa9iiGIEZg10SfBw==
@@ -5356,6 +5554,11 @@ ignore@^5.2.0:
resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.2.0.tgz#6d3bac8fa7fe0d45d9f9be7bac2fc279577e345a"
integrity sha512-CmxgYGiEPCLhfLnpPp1MoRmifwEIOgjcHXxOBjv7mY96c+eWScsOP9c112ZyLdWHi0FxHjI+4uVhKYp/gcdRmQ==
+immer@^9.0.16:
+ version "9.0.16"
+ resolved "https://registry.yarnpkg.com/immer/-/immer-9.0.16.tgz#8e7caab80118c2b54b37ad43e05758cdefad0198"
+ integrity sha512-qenGE7CstVm1NrHQbMh8YaSzTZTFNP3zPqr3YU0S0UY441j4bJTg4A2Hh5KAhwgaiU6ZZ1Ar6y/2f4TblnMReQ==
+
immer@^9.0.7:
version "9.0.15"
resolved "https://registry.yarnpkg.com/immer/-/immer-9.0.15.tgz#0b9169e5b1d22137aba7d43f8a81a495dd1b62dc"
@@ -6554,7 +6757,7 @@ lodash.uniq@^4.5.0:
resolved "https://registry.yarnpkg.com/lodash.uniq/-/lodash.uniq-4.5.0.tgz#d0225373aeb652adc1bc82e4945339a842754773"
integrity sha512-xfBaXQd9ryd9dlSDvnvI0lvxfLJlYAZzXomUYzLKtUeOQvOP5piqAWuGtrhWeqaXK9hhoM/iyJc5AV+XfsX3HQ==
-lodash@^4.17.15, lodash@^4.17.20, lodash@^4.17.21, lodash@^4.7.0, lodash@~4.17.5:
+lodash@^4.17.11, lodash@^4.17.15, lodash@^4.17.20, lodash@^4.17.21, lodash@^4.7.0, lodash@~4.17.5:
version "4.17.21"
resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c"
integrity sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==
@@ -7203,7 +7406,7 @@ picocolors@^1.0.0:
resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-1.0.0.tgz#cb5bdc74ff3f51892236eaf79d68bc44564ab81c"
integrity sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==
-picomatch@^2.0.4, picomatch@^2.2.1, picomatch@^2.2.2, picomatch@^2.2.3, picomatch@^2.3.1:
+picomatch@^2.0.4, picomatch@^2.2.1, picomatch@^2.2.2, picomatch@^2.2.3, picomatch@^2.3.0, picomatch@^2.3.1:
version "2.3.1"
resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42"
integrity sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==
@@ -7763,7 +7966,7 @@ postcss-unique-selectors@^5.1.1:
dependencies:
postcss-selector-parser "^6.0.5"
-postcss-value-parser@^4.0.0, postcss-value-parser@^4.1.0, postcss-value-parser@^4.2.0:
+postcss-value-parser@^4.0.0, postcss-value-parser@^4.0.2, postcss-value-parser@^4.1.0, postcss-value-parser@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz#723c09920836ba6d3e5af019f92bc0971c02e514"
integrity sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==
@@ -8704,6 +8907,11 @@ setprototypeof@1.2.0:
resolved "https://registry.yarnpkg.com/setprototypeof/-/setprototypeof-1.2.0.tgz#66c9a24a73f9fc28cbe66b09fed3d33dcaf1b424"
integrity sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==
+shallowequal@^1.1.0:
+ version "1.1.0"
+ resolved "https://registry.yarnpkg.com/shallowequal/-/shallowequal-1.1.0.tgz#188d521de95b9087404fd4dcb68b13df0ae4e7f8"
+ integrity sha512-y0m1JoUZSlPAjXVtPPW70aZWfIL/dSP7AFkRnniLCrK/8MDKog3TySTBmckD+RObVxH0v4Tox67+F14PdED2oQ==
+
shebang-command@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/shebang-command/-/shebang-command-2.0.0.tgz#ccd0af4f8835fbdc265b82461aaf0c36663f34ea"
@@ -9036,6 +9244,22 @@ style-loader@^3.3.1:
resolved "https://registry.yarnpkg.com/style-loader/-/style-loader-3.3.1.tgz#057dfa6b3d4d7c7064462830f9113ed417d38575"
integrity sha512-GPcQ+LDJbrcxHORTRes6Jy2sfvK2kS6hpSfI/fXhPt+spVzxF6LJ1dHLN9zIGmVaaP044YKaIatFaufENRiDoQ==
+styled-components@^5.3.6:
+ version "5.3.6"
+ resolved "https://registry.yarnpkg.com/styled-components/-/styled-components-5.3.6.tgz#27753c8c27c650bee9358e343fc927966bfd00d1"
+ integrity sha512-hGTZquGAaTqhGWldX7hhfzjnIYBZ0IXQXkCYdvF1Sq3DsUaLx6+NTHC5Jj1ooM2F68sBiVz3lvhfwQs/S3l6qg==
+ dependencies:
+ "@babel/helper-module-imports" "^7.0.0"
+ "@babel/traverse" "^7.4.5"
+ "@emotion/is-prop-valid" "^1.1.0"
+ "@emotion/stylis" "^0.8.4"
+ "@emotion/unitless" "^0.7.4"
+ babel-plugin-styled-components ">= 1.12.0"
+ css-to-react-native "^3.0.0"
+ hoist-non-react-statics "^3.0.0"
+ shallowequal "^1.1.0"
+ supports-color "^5.5.0"
+
stylehacks@^5.1.0:
version "5.1.0"
resolved "https://registry.yarnpkg.com/stylehacks/-/stylehacks-5.1.0.tgz#a40066490ca0caca04e96c6b02153ddc39913520"
@@ -9049,7 +9273,7 @@ stylis@4.0.13:
resolved "https://registry.yarnpkg.com/stylis/-/stylis-4.0.13.tgz#f5db332e376d13cc84ecfe5dace9a2a51d954c91"
integrity sha512-xGPXiFVl4YED9Jh7Euv2V220mriG9u4B2TA6Ybjc1catrstKD2PpIdU3U0RKpkVBC2EhmL/F0sPCr9vrFTNRag==
-supports-color@^5.3.0:
+supports-color@^5.3.0, supports-color@^5.5.0:
version "5.5.0"
resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-5.5.0.tgz#e2e69a44ac8772f78a1ec0b35b689df6530efc8f"
integrity sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==
@@ -9475,6 +9699,11 @@ uri-js@^4.2.2:
dependencies:
punycode "^2.1.0"
+use-immer@^0.7.0:
+ version "0.7.0"
+ resolved "https://registry.yarnpkg.com/use-immer/-/use-immer-0.7.0.tgz#e3bfbb806b5e3ff6e37441be74c306d91c1e0962"
+ integrity sha512-Re4hjrP3a/2ABZjAc0b7AK9s626bnO+H33RO2VUhiDZ2StBz5B663K6WNNlr4QtHWaGUmvLpwt3whFvvWuolQw==
+
util-deprecate@^1.0.1, util-deprecate@^1.0.2, util-deprecate@~1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
diff --git a/install-local.sh b/install-local.sh
new file mode 100755
index 0000000..a4e0f22
--- /dev/null
+++ b/install-local.sh
@@ -0,0 +1,16 @@
+set -eux
+
+rm -rf backend/vecsim_app/templates/build # delete previously built frontend
+
+cd ./frontend
+yarn install --no-optional
+yarn build
+
+python -m pip install --upgrade pip setuptools wheel
+
+cd ../backend
+pip install -e .
+cp -r ../frontend/build vecsim_app/templates/
+
+cd ./vecsim_app
+chmod +x entrypoint.sh
diff --git a/run-local.sh b/run-local.sh
new file mode 100755
index 0000000..adbf589
--- /dev/null
+++ b/run-local.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Start the local Redis container and run the backend against it.
+#
+# The Compose command can be overridden through the DOCKER_COMPOSE
+# environment variable; the containers are brought down when this script exits.
+set -eux
+
+# Directory containing this script; the compose file and backend/ live
+# next to it, so the script can be started from any working directory.
+repo_root=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+compose_file="$repo_root/docker-local-redis.yml"
+
+# Left unquoted at the call sites on purpose: the default value must
+# word-split into `docker` + `compose`.
+DOCKER_COMPOSE=${DOCKER_COMPOSE:-docker compose} # on some Mac machines, it's docker-compose
+
+# Register cleanup before starting anything, so the containers are also
+# brought down when `up --wait` itself fails and `set -e` aborts the script.
+stop() { $DOCKER_COMPOSE -f "$compose_file" down; }
+trap stop EXIT
+
+$DOCKER_COMPOSE -f "$compose_file" up --detach --wait redis-vector-db
+
+cd "$repo_root/backend/vecsim_app"
+export DEPLOYMENT="dev"
+export REDIS_HOST="localhost"
+export REDIS_PORT=6379
+export REDIS_PASSWORD="testing123"
+export REDIS_DB=0
+./entrypoint.sh