Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] dandi search command #1126

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 212 additions & 0 deletions dandi/cli/cmd_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
from pathlib import Path

from SPARQLWrapper import JSON, SPARQLWrapper
import click
import pandas as pd

from .base import map_to_click_exceptions

# Supported fields for "dandisets" searches.
# Each key is a field name accepted on the command line; each value is a
# two-item list: [SPARQL variable name (without the leading "?"),
# triple pattern that binds that variable]. The patterns hang off ``?as``,
# which the dandisets query binds via ``?d dandi:assetsSummary ?as``.
# TODO: add more
DANDISETS_FIELDS = {
    "approach": ["apr", "?as dandi:approach / schema:name ?apr ."],
    "species_id": ["sid", "?as dandi:species / schema:identifier ?sid ."],
    "species_name": ["snm", "?as dandi:species / schema:name ?snm ."],
}

# Supported fields for "assets" searches.
# Same layout as the dandisets table: field name -> [SPARQL variable name
# (without the leading "?"), triple pattern that binds that variable].
# The patterns hang off ``?asset``, which the assets query binds itself.
ASSETS_FIELDS = {
    "size": ["size", "?asset schema:contentSize ?size ."],
    "format": ["format", "?asset schema:encodingFormat ?format ."],
    "age": ["age", "?asset prov:wasAttributedTo / dandi:age / schema:value ?age ."],
}


def parse_validate(ctx, param, value):
    """Click callback: flatten comma-separated option values and validate them.

    Parameters
    ----------
    ctx
        The click context; only ``ctx.params`` and ``ctx.fail`` are used.
    param
        The click parameter being processed; only ``param.name`` is used.
    value
        Iterable of strings (one per occurrence of a ``multiple=True``
        option). Each string may hold several comma-separated values.
        ``None`` is treated as "no values".

    Returns
    -------
    list of str
        All individual values, in the order given.

    Notes
    -----
    Values are only checked against the known field names for the
    ``select_fields`` option, and only when ``search_type`` has already been
    parsed. Otherwise they are returned unchecked.
    """
    # Each occurrence of the option may itself carry several values.
    parsed = [item for chunk in (value or ()) for item in chunk.split(",")]

    choices = None
    if param.name == "select_fields":
        # Click invokes callbacks in command-line order, so ``search_type``
        # may not be in ctx.params yet (or may never have been given).
        # Indexing directly would raise KeyError in that case.
        search_type = ctx.params.get("search_type")
        if search_type == "dandisets":
            choices = list(DANDISETS_FIELDS)
        elif search_type == "assets":
            choices = list(ASSETS_FIELDS)

    if choices:
        for item in parsed:
            if item not in choices:
                ctx.fail(f"{item} is not in the list: {choices}")
    return parsed


@click.command(help="Search TODO")
@click.option(
"-F",
"--file",
help="Comma-separated list of fields to display. "
"An empty value to trigger a list of "
"available fields to be printed out",
)
@click.option(
"-t",
"--search_type",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here and everywhere below, use --long-option rather than --long_option, as we do everywhere else (run git grep -A5 '@click.option').

help="Type of the search.",
type=click.Choice(["dandisets", "assets"]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Familiarize yourself with #1357 and refactor similarly here.

)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FTR we have

download
  --download [assets,dandiset.yaml,all]
                                  Comma-separated list of elements to download
                                  [default: all]
ls
  --metadata [api|all|assets]

I wonder if we should unify somehow

@click.option(
"-s",
"--select_fields",
help="Field name for dandisets search",
callback=parse_validate,
multiple=True,
)
@click.option(
"-f",
"--filter_fields",
help="Field name for dandisets search",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The help text is the same as for -s, so what is the difference?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also we have in ls

  -F, --fields TEXT               Comma-separated list of fields to display.
                                  An empty value to trigger a list of
                                  available fields to be printed out
  -f, --format [auto|pyout|json|json_pp|json_lines|yaml]
                                  Choose the format/frontend for output. If
                                  'auto', 'pyout' will be used in case of
                                  multiple files, and 'yaml' for a single
                                  file.

I think we should be uniform with that here. With dandi/dandi-archive#1659 in mind, we adhere to the MVC pattern here too and clearly separate the model from rendering, so we should have a similar -f|--format option that simply controls the output of those "search result records". Well -- you already have --format below, so we are on the right track.
Maybe we should even consider refactoring/reusing the ls code here -- we just need a list of records to render. They might come from different search engines, etc.

type=(str, str),
multiple=True,
)
@click.option(
"--format",
help="Choose the format for output. TODO",
type=click.Choice(["stdout", "csv"]),
default="stdout",
)
@click.option(
"--number_of_lines",
help="Number of lines of output that will be printed",
default=10,
)
@click.option(
"-d",
"--database_name",
help="Database name",
default="dandisets_new",
)
@map_to_click_exceptions
def search(
    file=None,
    search_type=None,
    select_fields=None,
    filter_fields=None,
    format="stdout",
    number_of_lines=10,
    database_name="dandisets_new",
):
    """Run a SPARQL query against the search endpoint and print the results.

    The query is either read verbatim from ``file`` or generated from
    ``search_type``, ``select_fields`` and ``filter_fields``.

    Parameters
    ----------
    file
        Path to a file containing a complete SPARQL query. Mutually
        exclusive with ``search_type``.
    search_type
        ``"dandisets"`` or ``"assets"``; selects which query builder is used.
    select_fields
        Field names to include in the result; required with ``search_type``.
    filter_fields
        Iterable of ``(field, value)`` pairs; every field must also appear
        in ``select_fields``.
    format
        Output format; only ``"stdout"`` is implemented.
    number_of_lines
        Maximum number of result rows printed.
    database_name
        Name of the database, used as a path component of the endpoint URL.

    Raises
    ------
    NotImplementedError
        If ``format`` is not ``"stdout"``, or if neither ``file`` nor a
        supported ``search_type`` is given.
    Exception
        If the options are inconsistent (see the messages below).
    """
    if file and search_type:
        raise Exception("file and type are mutually exclusive options")

    # Fail before contacting the server rather than after a full round trip.
    if format != "stdout":
        raise NotImplementedError("only stdout format implemented for now")

    if file:
        query_str = Path(file).read_text()
    elif search_type in ("dandisets", "assets"):
        if not select_fields:
            raise Exception(
                f"select_fields is required if search type is {search_type}"
            )
        known_fields = DANDISETS_FIELDS if search_type == "dandisets" else ASSETS_FIELDS
        unknown = [name for name in select_fields if name not in known_fields]
        if unknown:
            # The option callback cannot always validate the names (it may run
            # before search_type is parsed), so check again here.
            raise Exception(
                f"unknown select fields {unknown} for search type {search_type}; "
                f"available fields: {list(known_fields)}"
            )
        for el in filter_fields or ():
            if el[0] not in select_fields:
                raise Exception(
                    f"field {el[0]} used in filter_fields, "
                    f"but select fields contain {select_fields}"
                )
        if search_type == "dandisets":
            query_str = create_dandisets_query(select_fields, filter_fields)
        else:
            query_str = create_assets_query(select_fields, filter_fields)
    else:
        raise NotImplementedError(
            "either a query file (-F/--file) or a search type "
            "(-t/--search_type) must be provided"
        )

    # NOTE(review): a reviewer pointed out that this SPARQL endpoint is just
    # one type of search; a few more already exist (even if not exposed via
    # API): https://llmsearch.dandiarchive.org/, https://dandiarchive.org/search
    # etc. The suggestion is to generalize the `dandi search [options] [query]`
    # interface into
    # `dandi search [common-options] HOW [how-specific-options-if-any] [query]`
    # or `dandi search [options] [--how HOW] [query]` so it can be expanded.
    #
    # The database name is part of the URL; it used to be hard-coded here,
    # which silently ignored the ``database_name`` option.
    endpoint = f"https://search.dandiarchive.org:5820/{database_name}/query"

    sparql = SPARQLWrapper(endpoint)
    sparql.setCredentials("anonymous", "anonymous")
    sparql.setReturnFormat(JSON)
    sparql.setQuery(query_str)
    results = sparql.queryAndConvert()

    print(results2df(results, number_of_lines))


def results2df(results, limit=10):
    """Convert a SPARQL JSON result into a DataFrame of plain values.

    Takes at most ``limit`` entries from ``results["results"]["bindings"]``
    and keeps, for every variable in an entry, only its ``"value"`` item.
    Each entry becomes one row; variable names become the columns.
    """
    rows = []
    for binding in results["results"]["bindings"][:limit]:
        rows.append({name: cell["value"] for name, cell in binding.items()})
    return pd.DataFrame(rows)


def filter_query(filter_fields, fields_dict):
    """Build the ``FILTER`` lines of a SPARQL query.

    Parameters
    ----------
    filter_fields
        Iterable of ``(field, value)`` pairs, or ``None``/empty for no
        filtering. ``value`` is one of:

        * ``"(min,max)"`` -- an exclusive range; either bound may be left
          empty (``"(,10)"``, ``"(5,)"``). The bounds are inserted into the
          query as written, without quoting.
        * ``"a,b,c"`` -- one or more comma-separated alternatives, each
          compared for equality as a string literal.
    fields_dict
        Mapping from field name to a sequence whose first item is the SPARQL
        variable name (without the leading ``?``).

    Returns
    -------
    str
        Zero or more ``FILTER (...)`` lines, each terminated by ``" \\n"``.

    Raises
    ------
    ValueError
        If a parenthesised value does not contain exactly two elements.
    KeyError
        If a field is not present in ``fields_dict``.
    """
    clauses = []
    for key, val in filter_fields or ():
        var = f"?{fields_dict[key][0]}"
        # startswith/endswith (rather than indexing) keeps "" from crashing.
        if val.startswith("(") and val.endswith(")"):
            bounds = val[1:-1].split(",")
            if len(bounds) != 2:
                raise ValueError(
                    "If value for filter is a tuple, it has to have 2 elements "
                )
            min_val, max_val = (bound.strip() for bound in bounds)
            if max_val and min_val:
                clauses.append(f"FILTER ({var} > {min_val} && {var} < {max_val}) \n")
            elif max_val:
                clauses.append(f"FILTER ({var} < {max_val}) \n")
            elif min_val:
                clauses.append(f"FILTER ({var} > {min_val}) \n")
            # "(,)" gives no bounds at all and therefore adds no filter.
        else:
            conditions = []
            for alternative in val.split(","):
                # Escape backslashes and double quotes so the value stays a
                # single, well-formed SPARQL string literal.
                literal = alternative.replace("\\", "\\\\").replace('"', '\\"')
                conditions.append(f'{var} = "{literal}"')
            clauses.append(f"FILTER ({' || '.join(conditions)}) \n")
    return "".join(clauses)


def create_dandisets_query(select_fields, filter_fields):
    """Assemble the SPARQL ``SELECT`` query used for dandisets searches.

    The query always selects ``?d`` and binds ``?as`` through
    ``?d dandi:assetsSummary ?as``. For every name in ``select_fields`` the
    variable and triple pattern registered in ``DANDISETS_FIELDS`` are added,
    followed by whatever ``filter_query`` produces for ``filter_fields``.
    """
    selected_vars = "".join(
        f" ?{DANDISETS_FIELDS[name][0]}" for name in select_fields
    )
    pieces = [
        f"SELECT DISTINCT ?d{selected_vars} WHERE \n",
        "{ \n",
        " ?d dandi:assetsSummary ?as . \n",
    ]
    # One triple pattern per selected field, in the order requested.
    pieces.extend(f" {DANDISETS_FIELDS[name][1]} \n" for name in select_fields)
    pieces.append(filter_query(filter_fields, DANDISETS_FIELDS))
    pieces.append("}")
    return "".join(pieces)


def create_assets_query(select_fields, filter_fields):
    """Assemble the SPARQL ``SELECT`` query used for assets searches.

    The query always selects ``?asset``, ``?d_id`` and ``?path``. For every
    name in ``select_fields`` the variable and triple pattern registered in
    ``ASSETS_FIELDS`` are added, followed by whatever ``filter_query``
    produces for ``filter_fields``.
    """
    selected_vars = "".join(f" ?{ASSETS_FIELDS[name][0]}" for name in select_fields)
    pieces = [
        f"SELECT DISTINCT ?asset ?d_id ?path{selected_vars} WHERE \n",
        "{ \n",
        " ?asset rdf:type dandi:Asset . \n",
        " ?d prov:hasMember ?asset . \n",
        " ?d schema:identifier ?d_id . \n",
        " ?asset dandi:path ?path . \n",
    ]
    # One triple pattern per selected field, in the order requested.
    pieces.extend(f" {ASSETS_FIELDS[name][1]} \n" for name in select_fields)
    pieces.append(filter_query(filter_fields, ASSETS_FIELDS))
    pieces.append("}")
    return "".join(pieces)
2 changes: 2 additions & 0 deletions dandi/cli/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def main(ctx, log_level, pdb=False):
from .cmd_ls import ls # noqa: E402
from .cmd_move import move # noqa: E402
from .cmd_organize import organize # noqa: E402
from .cmd_search import search # noqa: E402
from .cmd_service_scripts import service_scripts # noqa: E402
from .cmd_shell_completion import shell_completion # noqa: E402
from .cmd_upload import upload # noqa: E402
Expand All @@ -158,6 +159,7 @@ def main(ctx, log_level, pdb=False):
ls,
move,
organize,
search,
service_scripts,
shell_completion,
upload,
Expand Down
Loading