Skip to content

Commit

Permalink
Merge pull request #19 from climatepolicyradar/feature/rnd-931-make-i…
Browse files Browse the repository at this point in the history
…mportable

make embeddings generation function importable
  • Loading branch information
kdutia authored Feb 14, 2024
2 parents 6f4bacd + 299a209 commit a37d774
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
26 changes: 25 additions & 1 deletion cli/text2embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,33 @@ def run_as_cli(
encoding for files that have already been parsed. By default, files with IDs that
already exist in the output directory are skipped.
limit (Optional[int]): Optionally limit the number of text samples to process.
Useful for debugging.
device (str): Device to use for embeddings generation. Must be either "cuda", "mps",
device (str): Device to use for embeddings generation. Must be either "cuda", "mps",
or "cpu".
"""

return run_embeddings_generation(
input_dir=input_dir,
output_dir=output_dir,
s3=s3,
redo=redo,
device=device,
limit=limit,
)


def run_embeddings_generation(
input_dir: str,
output_dir: str,
s3: bool,
redo: bool,
device: str,
limit: Optional[int],
):
"""
Produce embeddings from document parser JSON outputs.
This is the importable entry point that run_as_cli delegates to; see the
docstring of run_as_cli for details of the arguments.
"""
# FIXME: This solution assumes that we have a json document with language = en (
# supported target language) for every document in the parser output. This isn't
# very robust. This solution also requires passing every document into the
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description = ""
authors = ["CPR Tech <[email protected]>"]

[tool.poetry.dependencies]
python = "~3.9"
python = "^3.9"
sentence-transformers = "^2.2.0"
huggingface_hub = ">=0.14.0,<1.0.0"
click = "^8.0.4"
Expand Down

0 comments on commit a37d774

Please sign in to comment.