diff --git a/.github/workflows/integration-testing.yml b/.github/workflows/integration-testing.yml new file mode 100644 index 0000000..a9accb2 --- /dev/null +++ b/.github/workflows/integration-testing.yml @@ -0,0 +1,58 @@ +name: Integration tests V4 + +on: + pull_request: + branches: + - main + +jobs: + container_job: + runs-on: ubuntu-latest + + services: + mongo: + image: mongo + ports: + - 27107:27107 + scicat-backend: + image: ghcr.io/scicatproject/scicat-backend-next:stable + ports: + - 3000:3000 + env: + MONGODB_URI: mongodb://mongo:27017/scicat + EXPRESS_SESSION_SECRET: "${EXPRESS_SESSION_SECRET}" + JWT_SECRET: "${JWT_SECRET}" + PORT: 3000 + HTTP_MAX_REDIRECTS: 5 + HTTP_TIMEOUT: 5000 + JWT_EXPIRES_IN: 3600 + SITE: SAMPLE-SITE + PID_PREFIX: PID.SAMPLE.PREFIX + DOI_PREFIX: DOI.SAMPLE.PREFIX + METADATA_KEYS_RETURN_LIMIT: 100 + METADATA_PARENT_INSTANCES_RETURN_LIMIT: 100 + ADMIN_GROUPS: admin,ingestor + + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - run: source continuous_integration/scripts/install.sh + + + - run: | + set -vxeuo pipefail + python -m pip install . + python -m pip install .[dev] + python -m pip list + + - run: | + set -vxeuo pipefail + coverage run -m pytest tests/tests_integration/tests_integration.py + coverage report + env: + BASE_URL: http://localhost:3000/api/v3 + SCICAT_USER: ingestor + SCICAT_PASSWORD: aman \ No newline at end of file diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml deleted file mode 100644 index 392e281..0000000 --- a/.github/workflows/linting.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: pre-commit - -on: - pull_request: - push: - branches: [main] - -jobs: - pre-commit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - uses: pre-commit/action@v2.0.3 \ No newline at end of file diff --git a/.github/workflows/publish-documentation.yml b/.github/workflows/publish-documentation.yml index 78f4544..5a1571f 100644 --- a/.github/workflows/publish-documentation.yml +++ b/.github/workflows/publish-documentation.yml @@ -37,7 +37,7 @@ jobs: shell: bash -l {0} run: | set -vxeuo pipefail - python -m pip install -r requirements-dev.txt + python -m pip install .[dev] python -m pip list - name: Build Docs shell: bash -l {0} diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 5b95152..b71b1be 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -7,6 +7,7 @@ on: - cron: '00 4 * * *' # daily at 4AM jobs: + build: runs-on: ubuntu-latest @@ -31,11 +32,19 @@ jobs: shell: bash -l {0} run: | set -vxeuo pipefail - python -m pip install -r requirements-dev.txt + python -m pip install .[dev] + python -m pip install .[hdf5] python -m pip list + + - name: Lint with flake8 + shell: bash -l {0} + run: | + set -vxeuo pipefail + python -m flake8 + - name: Test with pytest shell: bash -l {0} run: | set -vxeuo pipefail - coverage run -m pytest -v + coverage run -m pytest --ignore tests_integration -v coverage report diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 0baa145..0000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,15 +0,0 @@ -default_language_version: - python: python3 -repos: - - repo: https://github.com/ambv/black - rev: 21.12b0 - hooks: - - id: black - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.0.0 - hooks: - - id: flake8 - - repo: https://github.com/kynan/nbstripout - rev: 0.5.0 - 
hooks: - - id: nbstripout diff --git a/docs/pyscicatlogo.png b/docs/pyscicatlogo.png new file mode 100644 index 0000000..d6b31af Binary files /dev/null and b/docs/pyscicatlogo.png differ diff --git a/docs/pyscicatlogo.svg b/docs/pyscicatlogo.svg new file mode 100644 index 0000000..30f3c3a --- /dev/null +++ b/docs/pyscicatlogo.svg @@ -0,0 +1,164 @@ + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/conf.py b/docs/source/conf.py index 48a8171..28b3c36 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -90,7 +90,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. diff --git a/docs/source/howto/ingest.md b/docs/source/howto/ingest.md index 05cf37d..a694ed0 100644 --- a/docs/source/howto/ingest.md +++ b/docs/source/howto/ingest.md @@ -13,6 +13,7 @@ from pyscicat.model import ( Datablock, DataFile, Dataset, + Sample, Ownable ) @@ -61,6 +62,19 @@ Note that we store the provided dataset_id in a variable for later use. Also note the `sourceFolder`. This is a folder on the file system that SciCat has access to, and will contain the files for this `Dataset`. +Proposals and instruments have to be created by an administrator. A sample with `sampleId="gargleblaster"` can be created like this: +```python +sample = Sample( + sampleId="gargleblaster", + owner="Chamber of Commerce", + description="A legendary drink.", + sampleCharacteristics={"Flavour": "Unknown, but potent"}, + isPublished=False, + **ownable.dict() +) +sample_id = client.upload_sample(sample) # sample_id == "gargleblaster" +``` + ## Upload a Datablock ```python diff --git a/docs/source/howto/ingestion_simulation_dataset_ess.md b/docs/source/howto/ingestion_simulation_dataset_ess.md new file mode 100644 index 0000000..5491b60 --- /dev/null +++ b/docs/source/howto/ingestion_simulation_dataset_ess.md @@ -0,0 +1,248 @@ +# Ingest Simulation Dataset at ESS +During the design and commissioning of the ESS, many simulation datasets have been produced in the search for the best design and to validate it. +At ESS, we have decided to import such datasets into our SciCat instance to facilitate search, quickly assess the cumulative quality of the collected results and be able to start applying Machine Learning techniques to such data in the near future. + +## Background +Data scientists and modellers at ESS have produced many simulations, each one including multiple variations of the same design to explore its running parameters. +The process of ingesting all this information into SciCat will produce around a thousand new datasets. +To facilitate testing and validation of the information at each step of the process, data curators have decided to break down the process into multiple scripts which cumulatively collect all the information needed to create a meaningful entry in SciCat. +The process produces one JSON file containing the basic information, metadata and files associated with one dataset. +The last step is to read such a file and ingest it into SciCat. +The rest of this document covers all the code used to load the dataset information, create the matching models and create a new dataset and orig datablock in SciCat.
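+Before walking through the ingestion script, it can be useful to verify that a prepared file really contains the three sections described in the next section (dataset, orig_datablock and ownable). The snippet below is a minimal sketch and is not part of the ingestion script; the `check_dataset_file` helper and the glob pattern are illustrative and assume the example files shipped under `examples/data`.
+```python
+import json
+from pathlib import Path
+
+
+def check_dataset_file(path):
+    # a prepared entry is expected to carry the dataset body, the file list
+    # and the ownership information used by the ingestion steps below
+    with open(path, "r") as fh:
+        entry = json.load(fh)
+    missing = [key for key in ("dataset", "orig_datablock", "ownable") if key not in entry]
+    if missing:
+        raise ValueError(f"{path}: missing sections {missing}")
+    return entry
+
+
+# check the example files added alongside this document
+for dataset_file in sorted(Path("examples/data").glob("ingestion_simulation_dataset_ess_*_dataset.json")):
+    check_dataset_file(dataset_file)
+```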
+ +## Individual Dataset entry +Each dataset is prepared for ingestion and saved in an individual JSON file. +The example JSON file is available under the examples/data folder and has the following structure: + +```json +{ + "id": "0275d813-be6b-444f-812f-b8311d129361", + "dataset": { + "datasetName": "CAMEA CAMEA31 Hsize 4 moderator_size_y 3 PGESKSE", + "description": "CAMEA CAMEA31 Hsize 4 moderator_size_y 3 PGESKSE", + "principalInvestigator": "Max Novelli", + "creationLocation": "DMSC", + "owner": "Massimiliano Novelli", + "ownerEmail": "max.novelli@ess.eu", + "contactEmail": "max.novelli@ess.eu", + "sourceFolder": "/mnt/data/simulation/CAMEA/CAMEA31", + "creationTime": "2022-03-07T15:44:59.000Z", + "type": "raw", + "techniques": [ + { + "pid": "fe888574-5cc0-11ec-90c3-bf82943dec35", + "name": "Simulation" + } + ], + "size": 68386784, + "instrumentId": "", + "sampleId": "", + "proposalId": "", + "scientificMetadata": { + "sample_width": { + "value": 0.015, + "unit": "m" + }, + "sample_height": { + "value": 0.015, + "unit": "m" + }, + "divergence_requirement_horizontal": { + "value": 0.75, + "unit": "deg" + }, + "omissed" : { + "notes" : "Additional scientific metadata has been omitted for readability" + } + } + }, + "orig_datablock": { + "size": 68386784, + "ownerGroup": "ess", + "accessGroups": ["dmsc", "swap"], + "dataFileList": [ + { + "path": "launch_all.sh", + "size": 10171, + "time": "2014-01-23T19:52:37.000Z" + }, { + "path": "suggested_reruns-fails.sh", + "size": 448, + "time": "2014-01-23T19:53:04.000Z" + }, { + "notes" : "Additional file entries have been omitted for readability" + } + ] + }, + "ownable": { + "ownerGroup": "ess", + "accessGroups": ["dmsc"] + } +} + +``` +As you can see, the file is already structured around the three main components of the dataset: +- the main dataset body with its scientific metadata +- the ownable object +- the orig datablock containing all the files associated with the dataset + +The three sections make for simpler ingestion code. + +## Script +The script that ingests the dataset mentioned above is available in the examples folder under the name `ingestion_simulation_dataset_ess.py`. +In this section, we walk through the code of this script to illustrate the various functionalities. + + +### Overall description +The ingestion is organized in simple sections by leveraging the dataset information, which is already structured to match the operations required to create a full dataset in SciCat. +To simplify the script, it is assumed that pyscicat is installed system-wide and that the script is run from the folder where it is saved. All file paths are relative to the script folder. +At the beginning of the script, libraries are imported and we define the paths to the relevant JSON files.
+ +```python +# libraries +import json +import pyscicat.client as pyScClient +import pyscicat.model as pyScModel + + +# scicat configuration file +# includes scicat instance URL +# scicat user and password +scicat_configuration_file = "./data/ingestion_simulation_dataset_ess_config.json" +simulation_dataset_file = "./data/ingestion_simulation_dataset_ess_dataset.json" +``` + + +### Loading relevant information +In the next section, the script loads the configuration needed to communicate with SciCat and the dataset information. + +```python +# loads scicat configuration +with open(scicat_configuration_file, "r") as fh: + scicat_config = json.load(fh) + + +# loads simulation information from matching json file +with open(simulation_dataset_file, "r") as fh: + dataset_information = json.load(fh) +``` + + +### Authentication +Here, we instantiate the pyscicat client and perform the login. + +```python +scClient = pyScClient.ScicatClient( + base_url=scicat_config['scicat']['host'], + username=scicat_config['scicat']['username'], + password=scicat_config['scicat']['password'] +) +``` + + +### Create Ownable model +We then instantiate the ownable object, which is used to assign the correct owner and access groups to all the other SciCat entries that we are going to create. + +```python +ownable = pyScModel.Ownable( + **dataset_information['ownable'] +) +``` + +This notation is equivalent to passing in all the ownable object properties explicitly: +```python +ownable = pyScModel.Ownable( + ownerGroup=dataset_information['ownable']['ownerGroup'], + accessGroups=dataset_information['ownable']['accessGroups'] +) +``` + + +### Create Dataset model +As the next step, we need to instantiate a raw dataset object as defined in the pySciCat models. +Make sure to select the correct dataset type: raw or derived. In our case, we are creating a raw one, as specified in the dataset JSON file. +```python +dataset = pyScModel.RawDataset( + **dataset_information['dataset'], + **ownable.dict() +) +``` + +As highlighted in the previous section, this notation is equivalent to assigning all the model properties explicitly: +```python +dataset = pyScModel.RawDataset( + datasetName=dataset_information['dataset']['datasetName'], + description=dataset_information['dataset']['description'], + creationLocation=dataset_information['dataset']['creationLocation'], + principalInvestigator=dataset_information['dataset']['principalInvestigator'], + owner=dataset_information['dataset']['owner'], + ownerEmail=dataset_information['dataset']['ownerEmail'], + ... omitted ... + ownerGroup=dataset_information['ownable']['ownerGroup'], + accessGroups=dataset_information['ownable']['accessGroups'] +) +``` + + +### Submit Dataset to SciCat +We are now ready to post to SciCat and create a Dataset. + +```python +created_dataset = scClient.upload_new_dataset(dataset) +``` + +If the request is successful, the variable created_dataset contains the same information present in dataset, plus an additional field named _pid_ which contains the official pid assigned to this dataset by SciCat. + + +### Create OrigDatablock model +Now that we have created the dataset, we add the list of files related to this dataset. +As we have done with the other objects, we leverage the pySciCat model to make sure that the information is properly validated. +In this snippet of code, we use explicit notation for the main object, and the expansion for the inner file model.
+ +```python +origDataBlock = pyScModel.OrigDatablock( + size=dataset_information['orig_datablock']['size'], + datasetId=created_dataset['pid'], + dataFileList=[ + pyScModel.DataFile( + **file + ) + for file + in dataset_information['orig_datablock']['dataFileList'] + ], + **ownable.dict() +) +``` + +As highlighted before, this code is equivalent to: +```python +origDataBlock = pyScModel.OrigDatablock( + size=dataset_information['orig_datablock']['size'], + datasetId=created_dataset['pid'], + dataFileList=[ + pyScModel.DataFile( + path=file['path'], + size=file['size'], + time=file['time'] + ) + for file + in dataset_information['orig_datablock']['dataFileList'] + ], + ownerGroup=dataset_information['ownable']['ownerGroup'], + accessGroups=dataset_information['ownable']['accessGroups'] +) +``` + +### Submit OrigDatablock +With the original datablock object created, it is time to submit the request to SciCat. + +```python +created_orig_datablock = scClient.upload_dataset_origdatablock(origDataBlock) +``` + +Similarly to the dataset creation function, this call returns the same information provided as an argument, with the addition of the pid assigned to the entry by SciCat. + + +## Validate the dataset +At this point, you can visit your instance of SciCat and you should see the dataset that we just created in the list of datasets. The file list can be viewed by visiting the _Datafiles_ tab on the dataset details page. + diff --git a/docs/source/index.md b/docs/source/index.md index c32ff5a..1fae2e9 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -17,6 +17,7 @@ installation ```{toctree} :caption: How To Guides howto/ingest +howto/ingestion_simulation_dataset_ess ``` diff --git a/examples/data/ingestion_simulation_dataset_ess_config.json b/examples/data/ingestion_simulation_dataset_ess_config.json new file mode 100644 index 0000000..6b49bdd --- /dev/null +++ b/examples/data/ingestion_simulation_dataset_ess_config.json @@ -0,0 +1,7 @@ +{ + "scicat" : { + "host": "", + "username": "ingestor", + "password": "" + } +} diff --git a/examples/data/ingestion_simulation_dataset_ess_derived_dataset.json b/examples/data/ingestion_simulation_dataset_ess_derived_dataset.json new file mode 100644 index 0000000..53c18b4 --- /dev/null +++ b/examples/data/ingestion_simulation_dataset_ess_derived_dataset.json @@ -0,0 +1,256 @@ +{ + "id": "9be3bd96-e256-11ec-bd08-f32122965a87", + "dataset": { + "datasetName": "CAMEA CAMEA31 Hsize 4 moderator_size_y 3 PGESKSE derived", + "description": "CAMEA CAMEA31 Hsize 4 moderator_size_y 3 PGESKSE", + "investigator": "Max Novelli", + "inputDatasets" : ["0275d813-be6b-444f-812f-b8311d129361"], + "usedSoftware" : ["python","My software"], + "jobParameters" : { + "parameter-1" : "value-1", + "parameter-2" : "value-2" + }, + "jobLogData" : "Some jebrish about the dataset", + "owner": "Massimiliano Novelli", + "ownerEmail": "max.novelli@ess.eu", + "contactEmail": "max.novelli@ess.eu", + "sourceFolder": "/mnt/data/simulation/CAMEA/CAMEA31", + "creationTime": "2022-03-07T15:44:59.000Z", + "type": "derived", + "scientificMetadata": { + "sample_width": { "value": 0.015, "unit": "m" }, + "sample_height": { "value": 0.015, "unit": "m" }, + "divergence_requirement_horizontal": { "value": 0.75, "unit": "deg" }, + "divergence_requirement_vertical": { "value": 1, "unit": "deg" }, + "guide_sample_distance": { "value": 0.6, "unit": "m" }, + "lower_wavelength_limit": { "value": 1, "unit": "\u00c5" }, + "upper_wavelength_limit": { "value": 3.6, "unit": "\u00c5" }, +
"moderator_width": { "value": 0.12, "unit": "m" }, + "moderator_height": { "value": 0.03, "unit": "m" }, + "moderator_sample_distance": { "value": 170, "unit": "m" }, + "parsing_variables": { "value": "guide_start , startx1 , starty1 , length1", "unit": "" }, + "parsing_min_guide_start": { "value": 2.000035881054106, "unit": "m" }, + "parsing_max_guide_start": { "value": 5.407538318585075, "unit": "m" }, + "parsing_mean_guide_start": { "value": 2.3475508029429557, "unit": "m" }, + "parsing_std_guide_start": { "value": 0.5522363822422368, "unit": "m" }, + "parsing_min_startx1": { "value": 0.006706596967962139, "unit": "m" }, + "parsing_max_startx1": { "value": 0.1460959338571846, "unit": "m" }, + "parsing_mean_startx1": { "value": 0.08885675463366878, "unit": "m" }, + "parsing_std_startx1": { "value": 0.017699812942929365, "unit": "m" }, + "parsing_min_starty1": { "value": 0.011762187831963904, "unit": "m" }, + "parsing_max_starty1": { "value": 0.14999127413576652, "unit": "m" }, + "parsing_mean_starty1": { "value": 0.13009670276273638, "unit": "m" }, + "parsing_std_starty1": { "value": 0.011522927034872269, "unit": "m" }, + "parsing_min_length1": { "value": 28.915197821153896, "unit": "" }, + "parsing_max_length1": { "value": 95.07944574028325, "unit": "" }, + "parsing_mean_length1": { "value": 64.23126877070395, "unit": "" }, + "parsing_std_length1": { "value": 10.210341803833671, "unit": "" }, + "optimization_name": { "value": "PGESKSE", "unit": "" }, + "configuration_summary": { "value": "PGESKSE", "unit": "" }, + "best_figure_of_merit": { "value": "0.25293", "unit": "" }, + "brilliance_transfer": { "value": "0.47344", "unit": "" }, + "event_file_name_suffix": { "value": "4Hsize_3moderator_size_y", "unit": "" }, + "number_of_parameters": { "value": 2, "unit": "" }, + "parameters_name": { "value": "Hsize , moderator_size_y", "unit": "" }, + "event_writen_present": { "value": true, "unit": "" }, + "event_writen_file": { "value": "master_record-writen_4Hsize_3moderator_size_y.txt", "unit": "" }, + "event_writen_timestamp": { "value": "2014-01-23T19:52:38", "unit": "" }, + "event_done_present": { "value": true, "unit": "" }, + "event_done_file": { "value": "master_record-done_4Hsize_3moderator_size_y.txt", "unit": "" }, + "event_done_timestamp": { "value": "2014-01-25T00:35:55", "unit": "" }, + "event_analysis_present": { "value": true, "unit": "" }, + "event_analysis_file": { "value": "output/analysis/master_record-analyzed_4Hsize_3moderator_size_y.txt", "unit": "" }, + "event_analysis_timestamp": { "value": "2014-01-28T14:03:02", "unit": "" }, + "dataset_name": { "value": "CAMEA CAMEA31 Hsize 4 moderator_size_y 3 PGESKSE", "unit": "" }, + "run_name": { "value": "CAMEA CAMEA31", "unit": "" }, + "scan_name": { "value": "4Hsize_3moderator_size_y", "unit": "" }, + "output_file_name_base": { "value": "PGESKSE_4Hsize_3moderator_size_y", "unit": "" }, + "dataset_access_path": { "value": "/mnt/data/simulation/CAMEA/CAMEA31", "unit": "" }, + "parameters_structure": { "value": "[{\"name\": \"Hsize\", \"value\": \"1.5\", \"index\": \"4\"}, {\"name\": \"moderator_size_y\", \"value\": \"0.03\", \"index\": \"3\"}]", "unit": "" }, + "Hsize": { "value": 4, "unit": "cm" }, + "moderator_size_y": { "value": 3, "unit": "m" } + }, + "techniques": [ + { + "pid": "fe888574-5cc0-11ec-90c3-bf82943dec35", + "name": "Simulation" + } + ], + "size": 68386784, + "instrumentId": "" + }, + "orig_datablock": { + "size": 68386784, + "dataFileList": [ + { + "path": "launch_all.sh", + "size": 10171, + "time": 
"2014-01-23T19:52:37.000Z" + }, + { + "path": "suggested_reruns-fails.sh", + "size": 448, + "time": "2014-01-23T19:53:04.000Z" + }, + { + "path": "compile_all_py.sh", + "size": 273, + "time": "2014-01-23T19:52:37.000Z" + }, + { + "path": "clean3.sh", + "size": 354, + "time": "2014-01-25T10:44:54.000Z" + }, + { + "path": "master_record-done_4Hsize_3moderator_size_y.txt", + "size": 579, + "time": "2014-01-25T00:35:55.000Z" + }, + { + "path": "master_record-writen_4Hsize_3moderator_size_y.txt", + "size": 561, + "time": "2014-01-23T19:52:38.000Z" + }, + { + "path": "compile_all.sh", + "size": 259, + "time": "2014-01-23T19:52:37.000Z" + }, + { + "path": "output/brill_ref/brilliance_ref_4Hsize_3moderator_size_y.mat", + "size": 11624010, + "time": "2014-01-24T07:56:45.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_acceptance_ess.png", + "size": 521132, + "time": "2014-01-27T11:38:06.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_acceptance_pure.png", + "size": 518423, + "time": "2014-01-27T11:37:52.000Z" + }, + { + "path": "output/analysis/master_record-analyzed_4Hsize_3moderator_size_y.txt", + "size": 587, + "time": "2014-01-28T14:03:02.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_overall_pure.png", + "size": 144605, + "time": "2014-01-27T11:37:49.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_posdiv_ess.png", + "size": 336496, + "time": "2014-01-27T11:38:04.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y_all.mat", + "size": 34321077, + "time": "2014-01-25T00:35:55.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_overall_ess.png", + "size": 127660, + "time": "2014-01-27T11:38:02.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_geometry.dat", + "size": 2175, + "time": "2014-01-25T00:23:10.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y_ifit_analyse.m", + "size": 19482, + "time": "2014-01-23T19:52:40.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_geometry.png", + "size": 76259, + "time": "2014-01-27T11:38:09.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_posdiv_pure.png", + "size": 353828, + "time": "2014-01-27T11:37:50.000Z" + }, + { + "path": "brilliance_refference/brilliance_ifit_4Hsize_3moderator_size_y.m", + "size": 3048, + "time": "2014-01-23T19:52:33.000Z" + }, + { + "path": "brilliance_refference/brilliance_4Hsize_3moderator_size_y1.mat", + "size": 11626979, + "time": "2014-01-24T07:56:42.000Z" + }, + { + "path": "brilliance_refference/brilliance_4Hsize_3moderator_size_y.batch", + "size": 671, + "time": "2014-01-23T19:52:32.000Z" + }, + { + "path": "brilliance_refference/input_used_4Hsize_3moderator_size_y.txt", + "size": 358, + "time": "2014-01-23T19:52:35.000Z" + }, + { + "path": "brilliance_refference/run_brilliance_ifit_4Hsize_3moderator_size_y.m", + "size": 53, + "time": "2014-01-23T19:52:36.000Z" + }, + { + "path": "PGESKSE/PGESKSE_4Hsize_3moderator_size_y.batch", + "size": 734, + "time": "2014-01-23T19:52:48.000Z" + }, + { + "path": "PGESKSE/PGESKSE_4Hsize_3moderator_size_y_ifit.m", + "size": 11101, + "time": "2014-01-23T19:52:48.000Z" + }, + { + "path": "PGESKSE/err_PGESKSE_4Hsize_3moderator_size_y.txt", + "size": 0, + "time": "2014-01-24T21:13:29.000Z" + }, + { + "path": "PGESKSE/run_PGESKSE_4Hsize_3moderator_size_y_ifit.m", + "size": 50, + "time": "2014-01-23T19:52:51.000Z" + }, + { + "path": 
"PGESKSE/out_PGESKSE_4Hsize_3moderator_size_y.txt", + "size": 8681220, + "time": "2014-01-25T00:35:58.000Z" + }, + { + "path": "PGESKSE/compile_PGESKSE_py.sh", + "size": 558, + "time": "2014-01-23T19:52:45.000Z" + }, + { + "path": "PGESKSE/compile_PGESKSE.sh", + "size": 540, + "time": "2014-01-23T19:52:45.000Z" + }, + { + "path": "PGESKSE/PGESKSE_4Hsize_3moderator_size_y1.par", + "size": 918, + "time": "2014-01-25T00:35:55.000Z" + }, + { + "path": "PGESKSE/PGESKSE_4Hsize_3moderator_size_y1_geometry.dat", + "size": 2175, + "time": "2014-01-25T00:23:10.000Z" + } + ] + }, + "ownable": { + "ownerGroup": "ess", + "accessGroups": ["dmsc"] + } +} diff --git a/examples/data/ingestion_simulation_dataset_ess_raw_dataset.json b/examples/data/ingestion_simulation_dataset_ess_raw_dataset.json new file mode 100644 index 0000000..c2b5817 --- /dev/null +++ b/examples/data/ingestion_simulation_dataset_ess_raw_dataset.json @@ -0,0 +1,252 @@ +{ + "id": "0275d813-be6b-444f-812f-b8311d129361", + "dataset": { + "datasetName": "CAMEA CAMEA31 Hsize 4 moderator_size_y 3 PGESKSE raw", + "description": "CAMEA CAMEA31 Hsize 4 moderator_size_y 3 PGESKSE", + "creationLocation": "DMSC", + "principalInvestigator": "Max Novelli", + "owner": "Massimiliano Novelli", + "ownerEmail": "max.novelli@ess.eu", + "contactEmail": "max.novelli@ess.eu", + "sourceFolder": "/mnt/data/simulation/CAMEA/CAMEA31", + "creationTime": "2022-03-07T15:44:59.000Z", + "type": "raw", + "scientificMetadata": { + "sample_width": { "value": 0.015, "unit": "m" }, + "sample_height": { "value": 0.015, "unit": "m" }, + "divergence_requirement_horizontal": { "value": 0.75, "unit": "deg" }, + "divergence_requirement_vertical": { "value": 1, "unit": "deg" }, + "guide_sample_distance": { "value": 0.6, "unit": "m" }, + "lower_wavelength_limit": { "value": 1, "unit": "\u00c5" }, + "upper_wavelength_limit": { "value": 3.6, "unit": "\u00c5" }, + "moderator_width": { "value": 0.12, "unit": "m" }, + "moderator_height": { "value": 0.03, "unit": "m" }, + "moderator_sample_distance": { "value": 170, "unit": "m" }, + "parsing_variables": { "value": "guide_start , startx1 , starty1 , length1", "unit": "" }, + "parsing_min_guide_start": { "value": 2.000035881054106, "unit": "m" }, + "parsing_max_guide_start": { "value": 5.407538318585075, "unit": "m" }, + "parsing_mean_guide_start": { "value": 2.3475508029429557, "unit": "m" }, + "parsing_std_guide_start": { "value": 0.5522363822422368, "unit": "m" }, + "parsing_min_startx1": { "value": 0.006706596967962139, "unit": "m" }, + "parsing_max_startx1": { "value": 0.1460959338571846, "unit": "m" }, + "parsing_mean_startx1": { "value": 0.08885675463366878, "unit": "m" }, + "parsing_std_startx1": { "value": 0.017699812942929365, "unit": "m" }, + "parsing_min_starty1": { "value": 0.011762187831963904, "unit": "m" }, + "parsing_max_starty1": { "value": 0.14999127413576652, "unit": "m" }, + "parsing_mean_starty1": { "value": 0.13009670276273638, "unit": "m" }, + "parsing_std_starty1": { "value": 0.011522927034872269, "unit": "m" }, + "parsing_min_length1": { "value": 28.915197821153896, "unit": "" }, + "parsing_max_length1": { "value": 95.07944574028325, "unit": "" }, + "parsing_mean_length1": { "value": 64.23126877070395, "unit": "" }, + "parsing_std_length1": { "value": 10.210341803833671, "unit": "" }, + "optimization_name": { "value": "PGESKSE", "unit": "" }, + "configuration_summary": { "value": "PGESKSE", "unit": "" }, + "best_figure_of_merit": { "value": "0.25293", "unit": "" }, + "brilliance_transfer": { "value": "0.47344", 
"unit": "" }, + "event_file_name_suffix": { "value": "4Hsize_3moderator_size_y", "unit": "" }, + "number_of_parameters": { "value": 2, "unit": "" }, + "parameters_name": { "value": "Hsize , moderator_size_y", "unit": "" }, + "event_writen_present": { "value": true, "unit": "" }, + "event_writen_file": { "value": "master_record-writen_4Hsize_3moderator_size_y.txt", "unit": "" }, + "event_writen_timestamp": { "value": "2014-01-23T19:52:38", "unit": "" }, + "event_done_present": { "value": true, "unit": "" }, + "event_done_file": { "value": "master_record-done_4Hsize_3moderator_size_y.txt", "unit": "" }, + "event_done_timestamp": { "value": "2014-01-25T00:35:55", "unit": "" }, + "event_analysis_present": { "value": true, "unit": "" }, + "event_analysis_file": { "value": "output/analysis/master_record-analyzed_4Hsize_3moderator_size_y.txt", "unit": "" }, + "event_analysis_timestamp": { "value": "2014-01-28T14:03:02", "unit": "" }, + "dataset_name": { "value": "CAMEA CAMEA31 Hsize 4 moderator_size_y 3 PGESKSE", "unit": "" }, + "run_name": { "value": "CAMEA CAMEA31", "unit": "" }, + "scan_name": { "value": "4Hsize_3moderator_size_y", "unit": "" }, + "output_file_name_base": { "value": "PGESKSE_4Hsize_3moderator_size_y", "unit": "" }, + "dataset_access_path": { "value": "/mnt/data/simulation/CAMEA/CAMEA31", "unit": "" }, + "parameters_structure": { "value": "[{\"name\": \"Hsize\", \"value\": \"1.5\", \"index\": \"4\"}, {\"name\": \"moderator_size_y\", \"value\": \"0.03\", \"index\": \"3\"}]", "unit": "" }, + "Hsize": { "value": 4, "unit": "cm" }, + "moderator_size_y": { "value": 3, "unit": "m" } + }, + "techniques": [ + { + "pid": "fe888574-5cc0-11ec-90c3-bf82943dec35", + "name": "Simulation" + } + ], + "size": 68386784, + "instrumentId": "", + "sampleId": "", + "proposalId": "" + }, + "orig_datablock": { + "size": 68386784, + "dataFileList": [ + { + "path": "launch_all.sh", + "size": 10171, + "time": "2014-01-23T19:52:37.000Z" + }, + { + "path": "suggested_reruns-fails.sh", + "size": 448, + "time": "2014-01-23T19:53:04.000Z" + }, + { + "path": "compile_all_py.sh", + "size": 273, + "time": "2014-01-23T19:52:37.000Z" + }, + { + "path": "clean3.sh", + "size": 354, + "time": "2014-01-25T10:44:54.000Z" + }, + { + "path": "master_record-done_4Hsize_3moderator_size_y.txt", + "size": 579, + "time": "2014-01-25T00:35:55.000Z" + }, + { + "path": "master_record-writen_4Hsize_3moderator_size_y.txt", + "size": 561, + "time": "2014-01-23T19:52:38.000Z" + }, + { + "path": "compile_all.sh", + "size": 259, + "time": "2014-01-23T19:52:37.000Z" + }, + { + "path": "output/brill_ref/brilliance_ref_4Hsize_3moderator_size_y.mat", + "size": 11624010, + "time": "2014-01-24T07:56:45.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_acceptance_ess.png", + "size": 521132, + "time": "2014-01-27T11:38:06.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_acceptance_pure.png", + "size": 518423, + "time": "2014-01-27T11:37:52.000Z" + }, + { + "path": "output/analysis/master_record-analyzed_4Hsize_3moderator_size_y.txt", + "size": 587, + "time": "2014-01-28T14:03:02.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_overall_pure.png", + "size": 144605, + "time": "2014-01-27T11:37:49.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_posdiv_ess.png", + "size": 336496, + "time": "2014-01-27T11:38:04.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y_all.mat", + "size": 34321077, + "time": 
"2014-01-25T00:35:55.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_overall_ess.png", + "size": 127660, + "time": "2014-01-27T11:38:02.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_geometry.dat", + "size": 2175, + "time": "2014-01-25T00:23:10.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y_ifit_analyse.m", + "size": 19482, + "time": "2014-01-23T19:52:40.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_geometry.png", + "size": 76259, + "time": "2014-01-27T11:38:09.000Z" + }, + { + "path": "output/analysis/PGESKSE_4Hsize_3moderator_size_y1_posdiv_pure.png", + "size": 353828, + "time": "2014-01-27T11:37:50.000Z" + }, + { + "path": "brilliance_refference/brilliance_ifit_4Hsize_3moderator_size_y.m", + "size": 3048, + "time": "2014-01-23T19:52:33.000Z" + }, + { + "path": "brilliance_refference/brilliance_4Hsize_3moderator_size_y1.mat", + "size": 11626979, + "time": "2014-01-24T07:56:42.000Z" + }, + { + "path": "brilliance_refference/brilliance_4Hsize_3moderator_size_y.batch", + "size": 671, + "time": "2014-01-23T19:52:32.000Z" + }, + { + "path": "brilliance_refference/input_used_4Hsize_3moderator_size_y.txt", + "size": 358, + "time": "2014-01-23T19:52:35.000Z" + }, + { + "path": "brilliance_refference/run_brilliance_ifit_4Hsize_3moderator_size_y.m", + "size": 53, + "time": "2014-01-23T19:52:36.000Z" + }, + { + "path": "PGESKSE/PGESKSE_4Hsize_3moderator_size_y.batch", + "size": 734, + "time": "2014-01-23T19:52:48.000Z" + }, + { + "path": "PGESKSE/PGESKSE_4Hsize_3moderator_size_y_ifit.m", + "size": 11101, + "time": "2014-01-23T19:52:48.000Z" + }, + { + "path": "PGESKSE/err_PGESKSE_4Hsize_3moderator_size_y.txt", + "size": 0, + "time": "2014-01-24T21:13:29.000Z" + }, + { + "path": "PGESKSE/run_PGESKSE_4Hsize_3moderator_size_y_ifit.m", + "size": 50, + "time": "2014-01-23T19:52:51.000Z" + }, + { + "path": "PGESKSE/out_PGESKSE_4Hsize_3moderator_size_y.txt", + "size": 8681220, + "time": "2014-01-25T00:35:58.000Z" + }, + { + "path": "PGESKSE/compile_PGESKSE_py.sh", + "size": 558, + "time": "2014-01-23T19:52:45.000Z" + }, + { + "path": "PGESKSE/compile_PGESKSE.sh", + "size": 540, + "time": "2014-01-23T19:52:45.000Z" + }, + { + "path": "PGESKSE/PGESKSE_4Hsize_3moderator_size_y1.par", + "size": 918, + "time": "2014-01-25T00:35:55.000Z" + }, + { + "path": "PGESKSE/PGESKSE_4Hsize_3moderator_size_y1_geometry.dat", + "size": 2175, + "time": "2014-01-25T00:23:10.000Z" + } + ] + }, + "ownable": { + "ownerGroup": "ess", + "accessGroups": ["dmsc"] + } +} diff --git a/examples/data/published_data.json b/examples/data/published_data.json new file mode 100644 index 0000000..54a573f --- /dev/null +++ b/examples/data/published_data.json @@ -0,0 +1,56 @@ +[ + { + "doi": "10.17199/03dd9804-1b04-4d36-b0fb-cf66e9891e7d", + "affiliation": "ESS", + "creator": [ + "Oliver Lohmann" + ], + "publisher": "ESS", + "publicationYear": 2019, + "title": "SANS/Reflectometry", + "url": "", + "abstract": "SANS/Reflectometry", + "dataDescription": "https://github.com/ess-dmsc/ess_file_formats/wiki/NeXus", + "resourceType": "NeXus HDF5", + "numberOfFiles": null, + "sizeOfArchive": null, + "pidArray": [ + "20.500.12269/0a269002-83e2-4f18-bb98-36c01836d66a" + ], + "authors": [ + "Oliver Lohmann" + ], + "registeredTime": "2020-09-01T14:16:15.552Z", + "status": "registered", + "thumbnail": "", + "createdBy": "admin", + "updatedBy": "admin", + "createdAt": "2020-01-03T19:38:34.203Z", + "updatedAt": "2020-09-09T09:37:58.023Z" + }, + { 
+ "doi": "10.17199/165f8a52-c15d-4c96-ad7d-fb0cbe969f66", + "creator": [ + "Peter Kadletz" + ], + "publisher": "ESS", + "publicationYear": 2020, + "title": "Final bte", + "url": "", + "abstract": "Peter Kadletz, Tobias Richter", + "dataDescription": "https://github.com/ess-dmsc/ess_file_formats/wiki/NeXus", + "resourceType": "raw", + "numberOfFiles": null, + "sizeOfArchive": null, + "pidArray": [ + "20.500.12269/2511nicos_00002511.hdf" + ], + "registeredTime": "2020-09-01T14:16:17.272Z", + "status": "registered", + "scicatUser": "ingestor", + "thumbnail": "", + "updatedBy": "admin", + "createdAt": "2022-06-03T11:16:09.681Z", + "updatedAt": "2020-09-09T09:37:58.094Z" + } +] diff --git a/examples/ingestion_simulation_dataset_ess.py b/examples/ingestion_simulation_dataset_ess.py new file mode 100644 index 0000000..96c57c6 --- /dev/null +++ b/examples/ingestion_simulation_dataset_ess.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# coding: utf-8 + +# ingestion_simulation_dataset_ess +# +# Ingest the example simulation dataset in the specified scicat instance +# This script is provided as is, and as an example in pyScicat documentation +# +# +# Create by: Max Novelli +# max.novelli@ess.eu +# European Spallation Source ERIC, +# P.O. Box 176, +# SE-221 00, Lund, Sweden +# +# + + +# libraries +import json +import pyscicat.client as pyScClient +import pyscicat.model as pyScModel + + +# scicat configuration file +# includes scicat instance URL +# scicat user and password +scicat_configuration_file = "./data/ingestion_simulation_dataset_ess_config.json" +simulation_dataset_file = "./data/ingestion_simulation_dataset_ess.json" + + +# loads scicat configuration +with open(scicat_configuration_file, "r") as fh: + scicat_config = json.load(fh) + + +# loads simulation information from matching json file +with open(simulation_dataset_file, "r") as fh: + dataset_information = json.load(fh) + +# instantiate a pySciCat client +scClient = pyScClient.ScicatClient( + base_url=scicat_config["scicat"]["host"], + username=scicat_config["scicat"]["username"], + password=scicat_config["scicat"]["password"], +) + +# create an owneable object to be used with all the other models +# all the fields are retrieved directly from the simulation information +ownable = pyScModel.Ownable(**dataset_information["ownable"]) + + +# create dataset object from the pyscicat model +# includes ownable from previous step +dataset = pyScModel.RawDataset(**dataset_information["dataset"], **ownable.dict()) + + +# create dataset entry in scicat +# it returns the full dataset information, including the dataset pid assigned automatically by scicat +created_dataset = scClient.upload_new_dataset(dataset) + + +# create origdatablock object from pyscicat model +origDataBlock = pyScModel.OrigDatablock( + size=dataset_information["orig_datablock"]["size"], + datasetId=created_dataset["pid"], + dataFileList=[ + pyScModel.DataFile(**file) + for file in dataset_information["orig_datablock"]["dataFileList"] + ], + **ownable.dict() +) + +# create origDatablock associated with dataset in SciCat +# it returns the full object including SciCat id assigned when created +created_orig_datablock = scClient.upload_dataset_origdatablock(origDataBlock) diff --git a/pyscicat/_version.py b/pyscicat/_version.py index 6977658..bae1847 100644 --- a/pyscicat/_version.py +++ b/pyscicat/_version.py @@ -1,11 +1,13 @@ + # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # 
feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. -# This file is released into the public domain. Generated by -# versioneer-0.18 (https://github.com/warner/python-versioneer) +# This file is released into the public domain. +# Generated by versioneer-0.28 +# https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" @@ -14,6 +16,8 @@ import re import subprocess import sys +from typing import Callable, Dict +import functools def get_keywords(): @@ -51,40 +55,44 @@ class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" -LONG_VERSION_PY = {} -HANDLERS = {} +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - + """Create decorator to mark a method as the handler of a VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f - return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): """Call the given command(s).""" assert isinstance(commands, list) - p = None - for c in commands: + process = None + + popen_kwargs = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: try: - dispcmd = str([c] + args) + dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen( - [c] + args, - cwd=cwd, - env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr else None), - ) + process = subprocess.Popen([command] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None), **popen_kwargs) break - except EnvironmentError: + except OSError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue @@ -96,15 +104,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= if verbose: print("unable to find command, tried %s" % (commands,)) return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) - return None, p.returncode - return stdout, p.returncode + return None, process.returncode + return stdout, process.returncode def versions_from_parentdir(parentdir_prefix, root, verbose): @@ -116,25 +122,18 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): """ rootdirs = [] - for i in range(3): + for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - "date": None, - } - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": 
None, "date": None} + rootdirs.append(root) + root = os.path.dirname(root) # up a level if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -147,22 +146,21 @@ def git_get_keywords(versionfile_abs): # _version.py. keywords = {} try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: pass return keywords @@ -170,10 +168,14 @@ def git_get_keywords(versionfile_abs): @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because @@ -186,11 +188,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) + refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -199,7 +201,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r"\d", r)]) + tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -207,30 +209,28 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix) :] + r = ref[len(tag_prefix):] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r'\d', r): + continue if verbose: print("picking %s" % r) - return { - "version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": None, - "date": date, - } + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return { - "version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": "no suitable tags", - "date": None, - } + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* @@ -241,7 +241,15 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -249,24 +257,15 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command( - GITS, - [ - "describe", - "--tags", - "--dirty", - "--always", - "--long", - "--match", - "%s*" % tag_prefix, - ], - cwd=root, - ) + describe_out, rc = runner(GITS, [ + "describe", "--tags", "--dirty", "--always", "--long", + "--match", f"{tag_prefix}[[:digit:]]*" + ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() @@ -276,6 +275,39 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], + cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. 
If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out @@ -284,16 +316,17 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[: git_describe.rindex("-dirty")] + git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: - # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) return pieces # tag @@ -302,12 +335,10 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, - ) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] + pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -318,13 +349,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -355,25 +387,74 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . 
+ + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). Exceptions: - 1: no tags. 0.post.devDISTANCE + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver): + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces): + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + else: + rendered += ".post0.dev%d" % (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] else: # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] + rendered = "0.post0.dev%d" % pieces["distance"] return rendered @@ -404,12 +485,41 @@ def render_pep440_post(pieces): return rendered +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. - Eexceptions: + Exceptions: 1: no tags. 
0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: @@ -469,23 +579,25 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return { - "version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None, - } + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": @@ -495,13 +607,9 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return { - "version": rendered, - "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], - "error": None, - "date": pieces.get("date"), - } + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} def get_versions(): @@ -515,7 +623,8 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) except NotThisMethod: pass @@ -524,16 +633,13 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
- for i in cfg.versionfile_source.split("/"): + for _ in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None, - } + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -547,10 +653,6 @@ def get_versions(): except NotThisMethod: pass - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", - "date": None, - } + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/pyscicat/client.py b/pyscicat/client.py index f881768..1a06ac5 100644 --- a/pyscicat/client.py +++ b/pyscicat/client.py @@ -5,12 +5,22 @@ import hashlib import logging import json -from typing import List -import urllib +import re +from typing import Optional +from urllib.parse import urljoin, quote_plus +from pydantic import BaseModel import requests -from pyscicat.model import Attachment, Datablock, Dataset, RawDataset, DerivedDataset + +from pyscicat.model import ( + Attachment, + Dataset, + Instrument, + OrigDatablock, + Proposal, + Sample, +) logger = logging.getLogger("splash_ingest") can_debug = logger.isEnabledFor(logging.DEBUG) @@ -40,7 +50,7 @@ class ScicatClient: def __init__( self, - base_url: str = None, + base_url: str, token: str = False, username: str = None, password: str = None, @@ -69,225 +79,411 @@ def __init__( self._username = username # default username self._password = password # default password self._token = token # store token here - assert self._base_url is not None, "SciCat database URL must be provided" - - logger.info(f"Starting ingestor talking to scicat at: {self._base_url}") + self._headers = {} # store headers if not self._token: assert (self._username is not None) and ( self._password is not None ), "SciCat login credentials (username, password) must be provided if token is not provided" self._token = get_token(self._base_url, self._username, self._password) + self._headers["Authorization"] = "Bearer {}".format(self._token) - def _send_to_scicat(self, url, dataDict=None, cmd="post"): + def _send_to_scicat(self, cmd: str, endpoint: str, data: BaseModel = None): """sends a command to the SciCat API server using url and token, returns the response JSON Get token with the getToken method""" - if cmd == "post": - response = requests.post( - url, - params={"access_token": self._token}, - json=dataDict, - timeout=self._timeout_seconds, - stream=False, - verify=True, - ) - elif cmd == "delete": - response = requests.delete( - url, - params={"access_token": self._token}, - timeout=self._timeout_seconds, - stream=False, - ) - elif cmd == "get": - response = requests.get( - url, - params={"access_token": self._token}, - json=dataDict, - timeout=self._timeout_seconds, - stream=False, - ) - elif cmd == "patch": - response = requests.patch( - url, - params={"access_token": self._token}, - json=dataDict, - timeout=self._timeout_seconds, - stream=False, - ) - return response - - # Future support for samples - # def upload_sample(self, sample): - # sample = { - # "sampleId": projected_start_doc.get('sample_id'), - # "owner": projected_start_doc.get('pi_name'), - # "description": projected_start_doc.get('sample_name'), - # "createdAt": 
datetime.isoformat(datetime.utcnow()) + "Z", - # "sampleCharacteristics": {}, - # "isPublished": False, - # "ownerGroup": owner_group, - # "accessGroups": access_groups, - # "createdBy": self._username, - # "updatedBy": self._username, - # "updatedAt": datetime.isoformat(datetime.utcnow()) + "Z" - # } - # sample_url = f'{self._base_url}Samples' - - # resp = self._send_to_scicat(sample_url, sample) - # if not resp.ok: # can happen if sample id is a duplicate, but we can't tell that from the response - # err = resp.json()["error"] - # raise ScicatCommError(f"Error creating Sample {err}") - - def upload_dataset(self, dataset: Dataset) -> str: - """Upload a raw or derived dataset (method is autosensing) + return requests.request( + method=cmd, + url=urljoin(self._base_url, endpoint), + json=data.dict(exclude_none=True) if data is not None else None, + params={"access_token": self._token}, + headers=self._headers, + timeout=self._timeout_seconds, + stream=False, + verify=True, + ) + + def _call_endpoint( + self, + cmd: str, + endpoint: str, + data: BaseModel = None, + operation: str = "", + allow_404=False, + ) -> Optional[dict]: + response = self._send_to_scicat(cmd=cmd, endpoint=endpoint, data=data) + result = response.json() + if not response.ok: + err = result.get("error", {}) + if ( + allow_404 + and response.status_code == 404 + and re.match(r"Unknown (.+ )?id", err.get("message", "")) + ): + # The operation failed but because the object does not exist in SciCat. + logger.error("Error in operation %s: %s", operation, err) + return None + raise ScicatCommError(f"Error in operation {operation}: {err}") + logger.info( + "Operation '%s' successful%s", + operation, + f"pid={result['pid']}" if "pid" in result else "", + ) + return result + + def datasets_create(self, dataset: Dataset) -> str: + """ + Upload a new dataset. Uses the generic dataset endpoint. + Relies on the endpoint to sense the dataset type + This function was renamed. + It is still accessible with the original name for backward compatibility + The original name were create_dataset and upload_new_dataset Parameters ---------- dataset : Dataset - Dataset to load + Dataset to create Returns ------- str - pid (or unique identifier) of the newly created dataset + pid of the dataset Raises ------ ScicatCommError Raises if a non-20x message is returned """ - if isinstance(dataset, RawDataset): - dataset_url = self._base_url + "RawDataSets/replaceOrCreate" - elif isinstance(dataset, DerivedDataset): - dataset_url = self._base_url + "DerivedDatasets/replaceOrCreate" - else: - logging.error( - "Dataset type not recognized (not Derived or Raw dataset instances)" - ) - resp = self._send_to_scicat(dataset_url, dataset.dict(exclude_none=True)) - if not resp.ok: - err = resp.json()["error"] - raise ScicatCommError(f"Error creating dataset {err}") - new_pid = resp.json().get("pid") - logger.info(f"new dataset created {new_pid}") - return new_pid - - def upload_raw_dataset(self, dataset: Dataset) -> str: - """Upload a raw dataset + return self._call_endpoint( + cmd="post", endpoint="Datasets", data=dataset, operation="datasets_create" + ).get("pid") + + """ + Upload a new dataset + Original name, kept for for backward compatibility + """ + upload_new_dataset = datasets_create + create_dataset = datasets_create + + def datasets_update(self, dataset: Dataset, pid: str) -> str: + """Updates an existing dataset + This function was renamed. 
+ It is still accessible with the original name for backward compatibility + The original name was update_dataset. Parameters ---------- dataset : Dataset - Dataset to load + Dataset to update + + pid + pid (or unique identifier) of dataset being updated Returns ------- str - pid (or unique identifier) of the newly created dataset + pid (or unique identifier) of the dataset + Raises + ------ + ScicatCommError + Raises if a non-20x message is returned + """ + return self._call_endpoint( + cmd="patch", + endpoint=f"Datasets/{quote_plus(pid)}", + data=dataset, + operation="datasets_update", + ).get("pid") + + """ + Update a dataset + Original name, kept for for backward compatibility + """ + update_dataset = datasets_update + + def datasets_origdatablock_create(self, origdatablock: OrigDatablock) -> dict: + """ + Create a new SciCat Dataset OrigDatablock + This function has been renamed. + It is still accessible with the original name for backward compatibility + The original names were create_dataset_origdatablock and upload_dataset_origdatablock + + Parameters + ---------- + origdatablock : + The OrigDatablock to create + + Returns + ------- + dict + The created OrigDatablock with id Raises ------ ScicatCommError Raises if a non-20x message is returned + """ - raw_dataset_url = self._base_url + "RawDataSets/replaceOrCreate" - resp = self._send_to_scicat(raw_dataset_url, dataset.dict(exclude_none=True)) - if not resp.ok: - err = resp.json()["error"] - raise ScicatCommError(f"Error creating raw dataset {err}") - new_pid = resp.json().get("pid") - logger.info(f"new dataset created {new_pid}") - return new_pid + endpoint = f"Datasets/{quote_plus(origdatablock.datasetId)}/origdatablocks" + return self._call_endpoint( + cmd="post", + endpoint=endpoint, + data=origdatablock, + operation="datasets_origdatablock_create", + ) - def upload_derived_dataset(self, dataset: Dataset) -> str: - """Upload a derived dataset + """ + Create a new SciCat Dataset OrigDatablock + Original name, kept for for backward compatibility + """ + upload_dataset_origdatablock = datasets_origdatablock_create + create_dataset_origdatablock = datasets_origdatablock_create + + def datasets_attachment_create( + self, attachment: Attachment, datasetType: str = "Datasets" + ) -> dict: + """ + Create a new Attachment for a dataset. + Note that datasetType can be provided to determine the type of dataset + that this attachment is attached to. This is required for creating the url that SciCat uses. + This function has been renamed. + It is still accessible with the original name for backward compatibility + The original names were create_dataset_attachment and upload_attachment Parameters ---------- - dataset : Dataset - Dataset to upload + attachment : Attachment + Attachment to upload + + datasetType : str + Type of dataset to upload to, default is `Datasets` + Raises + ------ + ScicatCommError + Raises if a non-20x message is returned + """ + endpoint = f"{datasetType}/{quote_plus(attachment.datasetId)}/attachments" + return self._call_endpoint( + cmd="post", + endpoint=endpoint, + data=attachment, + operation="datasets_attachment_create", + ) + + """ + Create a new attachement for a dataset + Original name, kept for for backward compatibility + """ + upload_attachment = datasets_attachment_create + create_dataset_attachment = datasets_attachment_create + + def samples_create(self, sample: Sample) -> str: + """ + Create a new sample. + An error is raised when a sample with the same sampleId already exists. 
+ This function is also accessible as upload_sample. + + + Parameters + ---------- + sample : Sample + Sample to upload Returns ------- str - pid (or unique identifier) of the newly created dataset + ID of the newly created sample Raises ------ ScicatCommError Raises if a non-20x message is returned """ - derived_dataset_url = self._base_url + "DerivedDataSets/replaceOrCreate" - resp = self._send_to_scicat( - derived_dataset_url, dataset.dict(exclude_none=True) - ) - if not resp.ok: - err = resp.json()["error"] - raise ScicatCommError(f"Error creating raw dataset {err}") - new_pid = resp.json().get("pid") - logger.info(f"new dataset created {new_pid}") - return new_pid + return self._call_endpoint( + cmd="post", + endpoint="Samples", + data=sample, + operation="samples_create", + ).get("sampleId") + + upload_sample = samples_create + + def samples_update(self, sample: Sample, sampleId: str = None) -> str: + """Updates an existing sample + + Parameters + ---------- + sample : Sample + Sample to update + + sampleId + ID of sample being updated. By default, ID is taken from sample parameter. + + Returns + ------- + str + ID of the sample + + Raises + ------ + ScicatCommError + Raises if a non-20x message is returned + + AssertionError + Raises if no ID is provided + """ + if sampleId is None: + assert sample.sampleId is not None, "sampleId should not be None" + sampleId = sample.sampleId + sample.sampleId = None + return self._call_endpoint( + cmd="patch", + endpoint=f"Samples/{quote_plus(sampleId)}", + data=sample, + operation="samples_update", + ).get("sampleId") + + def instruments_create(self, instrument: Instrument): + """ + Create a new instrument. + Note that in SciCat admin rights are required to upload instruments. + An error is raised when an instrument with the same pid already exists. + This function is also accessible as upload_instrument. - def upload_datablock(self, datablock: Datablock, datasetType: str = "RawDatasets"): - """Upload a Datablock Parameters ---------- - datablock : Datablock - Datablock to upload + instrument : Instrument + Instrument to upload + + Returns + ------- + str + pid (or unique identifier) of the newly created instrument Raises ------ ScicatCommError Raises if a non-20x message is returned """ + return self._call_endpoint( + cmd="post", + endpoint="Instruments", + data=instrument, + operation="instruments_create", + ).get("pid") - url = ( - self._base_url - + f"{datasetType}/{urllib.parse.quote_plus(datablock.datasetId)}/origdatablocks" - ) - resp = self._send_to_scicat(url, datablock.dict(exclude_none=True)) - if not resp.ok: - err = resp.json()["error"] - raise ScicatCommError(f"Error creating datablock. {err}") + upload_instrument = instruments_create - def upload_attachment( - self, attachment: Attachment, datasetType: str = "RawDatasets" - ): - """Upload an Attachment. Note that datasetType can be provided to determine the type of dataset - that this attachment is attached to. This is required for creating the url that SciCat uses. + def instruments_update(self, instrument: Instrument, pid: str = None) -> str: + """Updates an existing instrument. + Note that in SciCat admin rights are required to upload instruments. Parameters ---------- - attachment : Attachment - Attachment to upload + instrument : Instrument + Instrument to update + + pid + pid (or unique identifier) of instrument being updated. + By default, pid is taken from instrument parameter. 
+ + Returns + ------- + str + ID of the instrument - datasetType : str - Type of dataset to upload to, default is `RawDatasets` Raises ------ ScicatCommError Raises if a non-20x message is returned + + AssertionError + Raises if no ID is provided """ - url = ( - self._base_url - + f"{datasetType}/{urllib.parse.quote_plus(attachment.datasetId)}/attachments" - ) - logging.debug(url) - resp = requests.post( - url, - params={"access_token": self._token}, - timeout=self._timeout_seconds, - stream=False, - json=attachment.dict(exclude_none=True), - verify=True, - ) - if not resp.ok: - err = resp.json()["error"] - raise ScicatCommError(f"Error uploading thumbnail. {err}") + if pid is None: + assert instrument.pid is not None, "pid should not be None" + pid = instrument.pid + instrument.pid = None + return self._call_endpoint( + cmd="patch", + endpoint=f"Instruments/{quote_plus(pid)}", + data=instrument, + operation="instruments_update", + ).get("pid") + + def proposals_create(self, proposal: Proposal): + """ + Create a new proposal. + Note that in SciCat admin rights are required to upload proposals. + An error is raised when a proposal with the same proposalId already exists. + This function is also accessible as upload_proposal. + - def get_datasets_full_query(self, skip=0, limit=25, query_fields=None): - """Gets datasets using the fullQuery mechanism of SciCat. This is + Parameters + ---------- + proposal : Proposal + Proposal to upload + + Returns + ------- + str + ID of the newly created proposal + + Raises + ------ + ScicatCommError + Raises if a non-20x message is returned + """ + return self._call_endpoint( + cmd="post", + endpoint="Proposals", + data=proposal, + operation="proposals_create", + ).get("proposalId") + + upload_proposal = proposals_create + + def proposals_update(self, proposal: Proposal, proposalId: str = None) -> str: + """Updates an existing proposal. + Note that in SciCat admin rights are required to upload proposals. + + Parameters + ---------- + proposal : Proposal + Proposal to update + + proposalId + ID of proposal being updated. By default, this is taken from proposal parameter. + + Returns + ------- + str + ID of the proposal + + Raises + ------ + ScicatCommError + Raises if a non-20x message is returned + + AssertionError + Raises if no ID is provided + """ + if proposalId is None: + assert proposal.proposalId is not None, "proposalId should not be None" + proposalId = proposal.proposalId + proposal.proposalId = None + return self._call_endpoint( + cmd="patch", + endpoint=f"Proposals/{quote_plus(proposalId)}", + data=proposal, + operation="proposals_update", + ).get("proposalId") + + def datasets_find( + self, skip: int = 0, limit: int = 25, query_fields: Optional[dict] = None + ) -> Optional[dict]: + """ + Gets datasets using the fullQuery mechanism of SciCat. This is appropriate for cases where might want paging and cases where you want to perform a text search on the Datasets collection. The full features of fullQuery search are beyond this document. @@ -297,6 +493,10 @@ def get_datasets_full_query(self, skip=0, limit=25, query_fields=None): To query based on the full text search, send `{"text": " List[Dataset]: - """Gets datasets using the simple fiter mechanism. 
This + """ + find a set of datasets according the full query provided + Original name, kept for for backward compatibility + """ + get_datasets_full_query = datasets_find + find_datasets_full_query = datasets_find + + def datasets_get_many(self, filter_fields: Optional[dict] = None) -> Optional[dict]: + """ + Gets datasets using the simple fiter mechanism. This is appropriate when you do not require paging or text search, but want to be able to limit results based on items in the Dataset object. + This function has been renamed and the old name has been mantained for backward compatibility + The previous names are find_datasets and get_datasets For example, a search for Datasets of a given proposalId would have ```python @@ -335,6 +544,10 @@ def get_datasets(self, filter_fields=None) -> List[Dataset]: ```python filterField = {"proposalId": ""} ``` + If you want to search on partial strings, you can use "like": + ```python + filterField = {"proposalId": {"like":"123"}} + ``` Parameters ---------- @@ -343,26 +556,211 @@ def get_datasets(self, filter_fields=None) -> List[Dataset]: """ if not filter_fields: filter_fields = {} - filter_fields = json.dumps(filter_fields) - url = f'{self._base_url}/Datasets/?filter={{"where":{filter_fields}}}' - response = self._send_to_scicat(url, cmd="get") - if not response.ok: - err = response.json()["error"] - logger.error(f'{err["name"]}, {err["statusCode"]}: {err["message"]}') - return None - return response.json() - - # this method is future, needs testing. - # def update_dataset(self, pid, fields: Dict): - # response = self._send_to_scicat( - # f"{self._base_url}/Datasets", dataDict=fields, cmd="patch" - # ) - # if not response.ok: - # err = response.json()["error"] - # logger.error(f'{err["name"]}, {err["statusCode"]}: {err["message"]}') - # return None - # return response.json() + endpoint = f'Datasets?filter={{"where":{filter_fields}}}' + return self._call_endpoint( + cmd="get", endpoint=endpoint, operation="datasets_get_many", allow_404=True + ) + + """ + find a set of datasets according to the simple filter provided + Original name, kept for for backward compatibility + """ + get_datasets = datasets_get_many + find_datasets = datasets_get_many + + def published_data_get_many(self, filter=None) -> Optional[dict]: + """ + retrieve all the published data using the simple fiter mechanism. This + is appropriate when you do not require paging or text search, but + want to be able to limit results based on items in the Dataset object. + This function has been renamed and the old name has been maintained for backward compatibility + The previous name are find_published_data and get_published_data + + For example, a search for published data of a given doi would have + ```python + filter = {"doi": "1234"} + ``` + + Parameters + ---------- + filter : dict + Dictionary of filtering fields. Must be json serializable. + """ + if filter: + filter = json.dumps(filter) + + endpoint = "PublishedData" + (f'?filter={{"where":{filter}}}' if filter else "") + return self._call_endpoint( + cmd="get", + endpoint=endpoint, + operation="published_data_get_many", + allow_404=True, + ) + + """ + find a set of published data according to the simple filter provided + Original name, kept for for backward compatibility + """ + get_published_data = published_data_get_many + find_published_data = published_data_get_many + + def datasets_get_one(self, pid: str) -> Optional[dict]: + """ + Gets dataset with the pid provided. + This function has been renamed. 
Provious name has been maintained for backward compatibility. + Previous names was get_dataset_by_pid + + Parameters + ---------- + pid : string + pid of the dataset requested. + """ + return self._call_endpoint( + cmd="get", + endpoint=f"Datasets/{quote_plus(pid)}", + operation="datasets_get_one", + allow_404=True, + ) + + get_dataset_by_pid = datasets_get_one + + def instruments_get_one(self, pid: str = None, name: str = None) -> Optional[dict]: + """ + Get an instrument by pid or by name. + If pid is provided it takes priority over name. + + This function has been renamed. Previous name has been maintained for backward compatibility. + Previous name was get_instrument + + Parameters + ---------- + pid : str + Pid of the instrument + + name : str + The name of the instrument + + Returns + ------- + dict + The instrument with the requested name + """ + + if pid: + endpoint = f"Instruments/{quote_plus(pid)}" + elif name: + query = json.dumps({"where": {"name": {"like": name}}}) + endpoint = f"Instruments/findOne?{query}" + else: + raise ValueError("You must specify instrument pid or name") + + return self._call_endpoint( + cmd="get", + endpoint=endpoint, + operation="instruments_get_one", + allow_404=True, + ) + + get_instrument = instruments_get_one + + def samples_get_one(self, pid: str) -> Optional[dict]: + """ + Get a sample by pid. + This function has been renamed. Previous name has been maintained for backward compatibility. + Previous name was get_sample + + + Parameters + ---------- + pid : str + The pid of the sample + + Returns + ------- + dict + The sample with the requested pid + """ + return self._call_endpoint( + cmd="get", + endpoint=f"Samples/{quote_plus(pid)}", + operation="samples_get_one", + allow_404=True, + ) + + get_sample = samples_get_one + + def proposals_get_one(self, pid: str = None) -> Optional[dict]: + """ + Get proposal by pid. + This function has been renamed. Previous name has been maintained for backward compatibility. + Previous name was get_proposal + + Parameters + ---------- + pid : str + The pid of the proposal + + Returns + ------- + dict + The proposal with the requested pid + """ + return self._call_endpoint( + cmd="get", endpoint=f"Proposals/{quote_plus(pid)}", allow_404=True + ) + + get_proposal = proposals_get_one + + def datasets_origdatablocks_get_one(self, pid: str) -> Optional[dict]: + """ + Get dataset orig datablocks by dataset pid. + This function has been renamed. Previous name has been maintained for backward compatibility. + Previous name was get_dataset_origdatablocks + + Parameters + ---------- + pid : str + The pid of the dataset + + Returns + ------- + dict + The orig_datablocks of the dataset with the requested pid + """ + return self._call_endpoint( + cmd="get", + endpoint=f"Datasets/{quote_plus(pid)}/origdatablocks", + operation="datasets_origdatablocks_get_one", + allow_404=True, + ) + + get_dataset_origdatablocks = datasets_origdatablocks_get_one + + def datasets_delete(self, pid: str) -> Optional[dict]: + """ + Delete dataset by pid + This function has been renamed. Previous name has been maintained for backward compatibility. 
+ Previous name was delete_dataset + + Parameters + ---------- + pid : str + The pid of the dataset to be deleted + + Returns + ------- + dict + response from SciCat backend + """ + return self._call_endpoint( + cmd="delete", + endpoint=f"Datasets/{quote_plus(pid)}", + operation="datasets_delete", + allow_404=True, + ) + + delete_dataset = datasets_delete def get_file_size(pathobj): @@ -401,26 +799,55 @@ def from_credentials(base_url: str, username: str, password: str): return from_token(base_url, token) -def get_token(base_url, username, password): - """logs in using the provided username / password combination - and receives token for further communication use""" - logger.info(f" Getting new token for user {username}") - if base_url[-1] != "/": - base_url = base_url + "/" +def _log_in_via_users_login(base_url, username, password): response = requests.post( - base_url + "Users/login", + urljoin(base_url, "Users/login"), + json={"username": username, "password": password}, + stream=False, + verify=True, + ) + if not response.ok: + logger.info(f" Failed to log in via endpoint Users/login: {response.json()}") + return response + + +def _log_in_via_auth_msad(base_url, username, password): + import re + + # Strip the api/vn suffix + base_url = re.sub(r"/api/v\d+/?", "", base_url) + response = requests.post( + urljoin(base_url, "auth/msad"), json={"username": username, "password": password}, stream=False, verify=True, ) if not response.ok: - logger.error(f" ** Error received: {response}") err = response.json()["error"] - logger.error(f' {err["name"]}, {err["statusCode"]}: {err["message"]}') + logger.error( + f'Error retrieving token for user: {err["name"]}, {err["statusCode"]}: {err["message"]}' + ) raise ScicatLoginError(response.content) - data = response.json() - # print("Response:", data) - token = data["id"] # not sure if semantically correct - logger.info(f" token: {token}") - return token + +def get_token(base_url, username, password): + """logs in using the provided username / password combination + and receives token for further communication use""" + # Users/login only works for functional accounts and auth/msad for regular users. + # Try both and see what works. This is not nice but seems to be the only + # feasible solution right now. 
+ logger.info(" Getting new token") + + response = _log_in_via_users_login(base_url, username, password) + if response.ok: + return response.json()["id"] # not sure if semantically correct + + response = _log_in_via_auth_msad(base_url, username, password) + if response.ok: + return response.json()["access_token"] + + err = response.json()["error"] + logger.error( + f' Failed log in: {err["name"]}, {err["statusCode"]}: {err["message"]}' + ) + raise ScicatLoginError(response.content) diff --git a/pyscicat/model.py b/pyscicat/model.py index f9f26fc..f7251be 100644 --- a/pyscicat/model.py +++ b/pyscicat/model.py @@ -13,20 +13,21 @@ class DatasetType(str, enum.Enum): derived = "derived" -class Ownable(BaseModel): - """Many objects in SciCat are ownable""" +class MongoQueryable(BaseModel): + """Many objects in SciCat are mongo queryable""" - ownerGroup: str - accessGroups: List[str] + createdBy: Optional[str] = None + updatedBy: Optional[str] = None + updatedAt: Optional[str] = None + createdAt: Optional[str] = None -class MongoQueryable(BaseModel): - """Many objects in SciCat are mongo queryable""" +class Ownable(MongoQueryable): + """Many objects in SciCat are ownable""" - createdBy: Optional[str] - updatedBy: Optional[str] - updatedAt: Optional[str] - createdAt: Optional[str] + ownerGroup: Optional[str] = None + accessGroups: Optional[List[str]] = None + instrumentGroup: Optional[str] = None class User(BaseModel): @@ -40,39 +41,37 @@ class User(BaseModel): id: str -class Proposal(Ownable, MongoQueryable): +class Proposal(Ownable): """ Defines the purpose of an experiment and links an experiment to principal investigator and main proposer """ - # TODO: find out which of these are not optional and update - proposalId: Optional[str] - pi_email: Optional[str] - pi_firstname: Optional[str] - pi_lastname: Optional[str] - email: Optional[str] - firstname: Optional[str] - lastname: Optional[str] - title: Optional[str] - abstract: Optional[str] - startTime: Optional[str] - endTime: Optional[str] + proposalId: str + pi_email: Optional[str] = None + pi_firstname: Optional[str] = None + pi_lastname: Optional[str] = None + email: str + firstname: Optional[str] = None + lastname: Optional[str] = None + title: Optional[str] = None # required in next backend version + abstract: Optional[str] = None + startTime: Optional[str] = None + endTime: Optional[str] = None MeasurementPeriodList: Optional[ List[dict] - ] # may need updating with the measurement period model + ] = None # may need updating with the measurement period model -class Sample(Ownable, MongoQueryable): +class Sample(Ownable): """ Models describing the characteristics of the samples to be investigated. Raw datasets should be linked to such sample definitions. """ - # TODO: find out which of these are not optional and update - sampleId: Optional[str] - owner: Optional[str] - description: Optional[str] - sampleCharacteristics: Optional[dict] + sampleId: Optional[str] = None + owner: Optional[str] = None + description: Optional[str] = None + sampleCharacteristics: Optional[dict] = None isPublished: bool = False @@ -84,15 +83,15 @@ class Job(MongoQueryable): track of analysis jobs e.g. for automated analysis workflows """ - id: Optional[str] + id: Optional[str] = None emailJobInitiator: str type: str - creationTime: Optional[str] # not sure yet which ones are optional or not. - executionTime: Optional[str] - jobParams: Optional[dict] - jobStatusMessage: Optional[str] - datasetList: Optional[dict] # documentation says dict, but should maybe be list? 
- jobResultObject: Optional[dict] # ibid. + creationTime: Optional[str] = None # not sure yet which ones are optional or not. + executionTime: Optional[str] = None + jobParams: Optional[dict] = None + jobStatusMessage: Optional[str] = None + datasetList: Optional[dict] = None # documentation says dict, but should maybe be list? + jobResultObject: Optional[dict] = None # ibid. class Instrument(MongoQueryable): @@ -100,41 +99,42 @@ class Instrument(MongoQueryable): Instrument class, most of this is flexibly definable in customMetadata """ - pid: Optional[str] + pid: Optional[str] = None name: str - customMetadata: Optional[dict] + customMetadata: Optional[dict] = None -class Dataset(Ownable, MongoQueryable): +class Dataset(Ownable): """ A dataset in SciCat, base class for derived and raw datasets """ - pid: Optional[str] - classification: Optional[str] + pid: Optional[str] = None + classification: Optional[str] = None contactEmail: str creationTime: str # datetime - datasetName: Optional[str] - description: Optional[str] - history: Optional[List[dict]] # list of foreigh key ids to the Messages table - instrumentId: Optional[str] + datasetName: Optional[str] = None + description: Optional[str] = None + history: Optional[List[dict]] = None # list of foreigh key ids to the Messages table + instrumentId: Optional[str] = None isPublished: Optional[bool] = False - keywords: Optional[List[str]] - license: Optional[str] - numberOfFiles: Optional[int] - numberOfFilesArchived: Optional[int] - orcidOfOwner: Optional[str] - packedSize: Optional[int] + keywords: Optional[List[str]] = None + license: Optional[str] = None + numberOfFiles: Optional[int] = None + numberOfFilesArchived: Optional[int] = None + orcidOfOwner: Optional[str] = None + packedSize: Optional[int] = None owner: str - ownerEmail: Optional[str] - sharedWith: Optional[List[str]] - size: Optional[int] + ownerEmail: Optional[str] = None + sharedWith: Optional[List[str]] = None + size: Optional[int] = None sourceFolder: str - sourceFolderHost: Optional[str] - techniques: Optional[List[dict]] # with {'pid':pid, 'name': name} as entries + sourceFolderHost: Optional[str] = None + techniques: Optional[List[dict]] = None # with {'pid':pid, 'name': name} as entries type: DatasetType - validationStatus: Optional[str] - version: Optional[str] + validationStatus: Optional[str] = None + version: Optional[str] = None + scientificMetadata: Optional[Dict] = None class RawDataset(Dataset): @@ -142,17 +142,13 @@ class RawDataset(Dataset): Raw datasets from which derived datasets are... derived. """ - principalInvestigator: Optional[str] - creationLocation: Optional[str] - dataFormat: str - type: DatasetType = "raw" - createdAt: Optional[str] # datetime - updatedAt: Optional[str] # datetime - dataFormat: Optional[str] - endTime: Optional[str] # datetime - sampleId: Optional[str] - proposalId: Optional[str] - scientificMetadata: Optional[Dict] + principalInvestigator: Optional[str] = None + creationLocation: Optional[str] = None + type: DatasetType = DatasetType.raw + dataFormat: Optional[str] = None + endTime: Optional[str] = None # datetime + sampleId: Optional[str] = None + proposalId: Optional[str] = None class DerivedDataset(Dataset): @@ -160,12 +156,12 @@ class DerivedDataset(Dataset): Derived datasets which have been generated based on one or more raw datasets """ - investigator: Optional[str] + investigator: str inputDatasets: List[str] - usedSoftware: List[str] # not optional! 
- jobParameters: Optional[dict] - jobLogData: Optional[str] - scientificMetadata: Optional[Dict] + usedSoftware: List[str] + jobParameters: Optional[dict] = None + jobLogData: Optional[str] = None + type: DatasetType = DatasetType.derived class DataFile(MongoQueryable): @@ -177,7 +173,8 @@ class DataFile(MongoQueryable): path: str size: int - time: Optional[str] + time: Optional[str] = None + chk: Optional[str] = None uid: Optional[str] = None gid: Optional[str] = None perm: Optional[str] = None @@ -188,12 +185,26 @@ class Datablock(Ownable): A Datablock maps between a Dataset and contains DataFiles """ - id: Optional[str] + id: Optional[str] = None # archiveId: str = None listed in catamel model, but comes back invalid? size: int - packedSize: Optional[int] - chkAlg: Optional[int] + packedSize: Optional[int] = None + chkAlg: Optional[int] = None version: str = None + instrumentGroup: Optional[str] = None + dataFileList: List[DataFile] + datasetId: str + + +class OrigDatablock(Ownable): + """ + An Original Datablock maps between a Dataset and contains DataFiles + """ + + id: Optional[str] = None + chkAlg :Optional[str] = None + size: int + instrumentGroup: Optional[str] = None dataFileList: List[DataFile] datasetId: str @@ -203,7 +214,35 @@ class Attachment(Ownable): Attachments can be any base64 encoded string...thumbnails are attachments """ - id: Optional[str] + id: Optional[str] = None thumbnail: str - caption: Optional[str] + caption: Optional[str] = None datasetId: str + + +class PublishedData: + """ + Published Data with registered DOI + """ + + doi: str + affiliation: str + creator: List[str] + publisher: str + publicationYear: int + title: str + url: Optional[str] = None + abstract: str + dataDescription: str + resourceType: str + numberOfFiles: Optional[int] = None + sizeOfArchive: Optional[int] = None + pidArray: List[str] + authors: List[str] + registeredTime: str + status: str + thumbnail: Optional[str] = None + createdBy: str + updatedBy: str + createdAt: str + updatedAt: str diff --git a/pyscicat/tests/test_client.py b/pyscicat/tests/test_client.py deleted file mode 100644 index 5c470e6..0000000 --- a/pyscicat/tests/test_client.py +++ /dev/null @@ -1,107 +0,0 @@ -from datetime import datetime -from pathlib import Path - -import requests_mock -from ..client import ( - from_credentials, - from_token, - encode_thumbnail, - get_file_mod_time, - get_file_size, -) - -from ..model import ( - Attachment, - Datablock, - DataFile, - Dataset, - Ownable, -) - -local_url = "http://localhost:3000/api/v3/" - - -def add_mock_requests(mock_request): - mock_request.post( - local_url + "Users/login", - json={"id": "a_token"}, - ) - mock_request.post(local_url + "Samples", json={"sampleId": "dataset_id"}) - mock_request.post(local_url + "RawDatasets/replaceOrCreate", json={"pid": "42"}) - mock_request.post( - local_url + "RawDatasets/42/origdatablocks", - json={"response": "random"}, - ) - mock_request.post( - local_url + "RawDatasets/42/attachments", - json={"response": "random"}, - ) - - -def test_scicate_ingest(): - with requests_mock.Mocker() as mock_request: - add_mock_requests(mock_request) - scicat = from_credentials( - base_url=local_url, - username="Zaphod", - password="heartofgold", - ) - assert ( - scicat._token == "a_token" - ), "scicat client set the token given by the server" - - ownable = Ownable(ownerGroup="magrathea", accessGroups=["deep_though"]) - thumb_path = Path(__file__).parent / "data/SciCatLogo.png" - - time = get_file_mod_time(thumb_path) - assert time is not None - 
size = get_file_size(thumb_path) - assert size is not None - - # RawDataset - dataset = Dataset( - path="/foo/bar", - size=42, - owner="slartibartfast", - contactEmail="slartibartfast@magrathea.org", - creationLocation="magrathea", - creationTime=str(datetime.now()), - type="raw", - instrumentId="earth", - proposalId="deepthought", - dataFormat="planet", - principalInvestigator="A. Mouse", - sourceFolder="/foo/bar", - scientificMetadata={"a": "field"}, - sampleId="gargleblaster", - **ownable.dict() - ) - dataset_id = scicat.upload_raw_dataset(dataset) - - # Datablock with DataFiles - data_file = DataFile(path="/foo/bar", size=42) - data_block = Datablock( - size=42, - version=1, - datasetId=dataset_id, - dataFileList=[data_file], - **ownable.dict() - ) - scicat.upload_datablock(data_block) - - # Attachment - attachment = Attachment( - datasetId=dataset_id, - thumbnail=encode_thumbnail(thumb_path), - caption="scattering image", - **ownable.dict() - ) - scicat.upload_attachment(attachment) - - -def test_initializers(): - with requests_mock.Mocker() as mock_request: - add_mock_requests(mock_request) - - client = from_token(local_url, "let me in!") - assert client._token == "let me in!" diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 2759b55..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,22 +0,0 @@ -# These are required for developing the package (running the tests, building -# the documentation) but not necessarily required for _using_ it. -codecov -coverage -flake8 -pytest -sphinx -twine -pre-commit -black -nbstripout -requests_mock -# These are dependencies of various sphinx extensions for documentation. -ipython -matplotlib -mistune <2.0.0 # temporary while sphinx sorts this out -myst-parser -numpydoc -sphinx-click -sphinx-copybutton -sphinxcontrib.openapi -sphinx_rtd_theme diff --git a/requirements-hdf5.txt b/requirements-hdf5.txt deleted file mode 100644 index c3b2f48..0000000 --- a/requirements-hdf5.txt +++ /dev/null @@ -1,2 +0,0 @@ -hdf5plugin -h5py \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 76aa8db..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pydantic -requests \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index c80108f..e87f52d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,59 @@ style = pep440-post versionfile_source = pyscicat/_version.py versionfile_build = pyscicat/_version.py tag_prefix = v + +[metadata] + +name = pyscicat +description = a python API to communicate with the Scicat API +long_description = file: README.md +long_description_content_type = text/markdown +author = Dylan McReynolds +author_email = dmcreynolds@lbl.gov +url = https://github.com/scicatproject/pyscicat +license_files = LICENSE +license=BSD (3-clause) +classifiers= + Development Status :: 2 - Pre-Alpha + Natural Language :: English + Programming Language :: Python :: 3.7 + + +[options] +include_package_data = True +packages= find: +install_requires = + pydantic + requests + +python_requires = >=3.7 + + + +[options.extras_require] +hdf5 = + hdf5plugin + h5py +dev = + codecov + coverage + flake8 + pytest + sphinx + twine + black + requests_mock +docs = + ipython + matplotlib + mistune <2.0.0 # temporary while sphinx sorts this out + myst-parser + numpydoc + sphinx-click + sphinx-copybutton + sphinxcontrib.openapi + sphinx_rtd_theme + +[options.packages.find] +exclude = + continuous_integration diff --git a/setup.py b/setup.py index 9ecf6de..1d0c607 100644 --- 
a/setup.py +++ b/setup.py @@ -1,68 +1,7 @@ -from pathlib import Path -from setuptools import setup, find_packages -import sys +from setuptools import setup import versioneer -min_version = (3, 7) -if sys.version_info < min_version: - error = """ -pyscicat does not support Python {0}.{1}. -Python {2}.{3} and above is required. Check your Python version like so: - -python3 --version - -This may be due to an out-of-date pip. Make sure you have pip >= 9.0.1. -Upgrade pip like so: - -pip install --upgrade pip -""".format( - *(sys.version_info[:2] + min_version) - ) - sys.exit(error) - -here = Path(__file__).absolute() - -with open(here.with_name("README.md"), encoding="utf-8") as readme_file: - readme = readme_file.read() - - -def read_requirements_from_here(here: Path, filename: str = None) -> list: - assert filename is not None, "filename as string must be provided" - assert here.with_name( - filename - ).exists(), f"requirements filename {filename.as_posix()} does not exist" - with open(here.with_name(filename)) as requirements_file: - # Parse requirements.txt, ignoring any commented-out lines. - requirements = [ - line - for line in requirements_file.read().splitlines() - if not line.startswith("#") - ] - return requirements - - -extras_require = {} -extras_require["base"] = read_requirements_from_here(here, "requirements.txt") -extras_require["h5tools"] = read_requirements_from_here(here, "requirements-hdf5.txt") - setup( - name="pyscicat", version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), - description="Code for communicating to a SciCat backend server python", - long_description=readme, - author="Dylan McReynolds", - author_email="dmcreynolds@lbl.gov", - url="https://github.com/scicatproject/pyscicat", - python_requires=">={}".format(".".join(str(n) for n in min_version)), - packages=find_packages(exclude=["docs", "tests"]), - include_package_data=True, - extras_require=extras_require, - install_requires=extras_require["base"], - license="BSD (3-clause)", - classifiers=[ - "Development Status :: 2 - Pre-Alpha", - "Natural Language :: English", - "Programming Language :: Python :: 3", - ], + cmdclass=versioneer.get_cmdclass() ) diff --git a/pyscicat/tests/__init__.py b/tests/test_hdf5/__init__.py similarity index 100% rename from pyscicat/tests/__init__.py rename to tests/test_hdf5/__init__.py diff --git a/pyscicat/hdf5/_tests/testdata/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs b/tests/test_hdf5/data/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs similarity index 100% rename from pyscicat/hdf5/_tests/testdata/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs rename to tests/test_hdf5/data/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs diff --git a/pyscicat/hdf5/_tests/test_hdf5sct.py b/tests/test_hdf5/test_hdf5sct.py similarity index 83% rename from pyscicat/hdf5/_tests/test_hdf5sct.py rename to tests/test_hdf5/test_hdf5sct.py index 466e190..3f3af64 100644 --- a/pyscicat/hdf5/_tests/test_hdf5sct.py +++ b/tests/test_hdf5/test_hdf5sct.py @@ -9,19 +9,19 @@ def test_readValue(): # more intelligent path finding: - p = sorted(Path(".").glob("**/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs"))[0] + p = sorted(Path("").glob("**/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs"))[0] v = h5Get(p, "/sasentry1/sasdata1/I") assert v != "none", "Did not extract value" def test_readAttribute(): - p = sorted(Path(".").glob("**/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs"))[0] + p = sorted(Path("").glob("**/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs"))[0] v = h5Get(p, "/sasentry1/sasdata1@timestamp") assert v != 
"none", "Did not extract attribute" def test_readMixedDict(): - p = sorted(Path(".").glob("**/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs"))[0] + p = sorted(Path("").glob("**/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs"))[0] v = h5GetDict( p, { @@ -38,7 +38,7 @@ def test_readMixedDict(): def test_readMetadata_withroot(): - p = sorted(Path(".").glob("**/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs"))[0] + p = sorted(Path("").glob("**/cylinderHex_r5_s12_T50_large_ranW_0p5.nxs"))[0] assert p.exists(), f"HDF5/NeXus test file: {p.as_posix()} cannot be found" resultDict = scientific_metadata(p, excludeRootEntry=True, skipKeyList=["sasdata1"]) assert resultDict is not None, "scientific_metadata has not returned anything" diff --git a/pyscicat/tests/conftest.py b/tests/test_pyscicat/__init__.py similarity index 100% rename from pyscicat/tests/conftest.py rename to tests/test_pyscicat/__init__.py diff --git a/tests/test_pyscicat/conftest.py b/tests/test_pyscicat/conftest.py new file mode 100644 index 0000000..e69de29 diff --git a/pyscicat/tests/data/SciCatLogo.png b/tests/test_pyscicat/data/SciCatLogo.png similarity index 100% rename from pyscicat/tests/data/SciCatLogo.png rename to tests/test_pyscicat/data/SciCatLogo.png diff --git a/tests/test_pyscicat/test_client.py b/tests/test_pyscicat/test_client.py new file mode 100644 index 0000000..46584f1 --- /dev/null +++ b/tests/test_pyscicat/test_client.py @@ -0,0 +1,230 @@ +from datetime import datetime +from pathlib import Path + +import pytest +import requests_mock +from pyscicat.client import ( + from_credentials, + from_token, + encode_thumbnail, + get_file_mod_time, + get_file_size, + ScicatCommError, +) + +from pyscicat.model import ( + Attachment, + Datablock, + DataFile, + Instrument, + Proposal, + RawDataset, + Sample, + Ownable, +) + +local_url = "http://localhost:3000/api/v3/" + + +def add_mock_requests(mock_request): + mock_request.post( + local_url + "Users/login", + json={"id": "a_token"}, + ) + + mock_request.post(local_url + "Instruments", json={"pid": "earth"}) + mock_request.post(local_url + "Proposals", json={"proposalId": "deepthought"}) + mock_request.post(local_url + "Samples", json={"sampleId": "gargleblaster"}) + mock_request.patch(local_url + "Instruments/earth", json={"pid": "earth"}) + mock_request.patch( + local_url + "Proposals/deepthought", json={"proposalId": "deepthought"} + ) + mock_request.patch( + local_url + "Samples/gargleblaster", json={"sampleId": "gargleblaster"} + ) + + mock_request.post(local_url + "RawDatasets/replaceOrCreate", json={"pid": "42"}) + mock_request.patch( + local_url + "Datasets/42", + json={"pid": "42"}, + ) + mock_request.post( + local_url + "Datasets/42/origdatablocks", + json={"response": "random"}, + ) + mock_request.post( + local_url + "Datasets/42/attachments", + json={"response": "random"}, + ) + + mock_request.post(local_url + "Datasets", json={"pid": "42"}) + + +def test_scicat_ingest(): + with requests_mock.Mocker() as mock_request: + add_mock_requests(mock_request) + scicat = from_credentials( + base_url=local_url, + username="Zaphod", + password="heartofgold", + ) + assert ( + scicat._token == "a_token" + ), "scicat client set the token given by the server" + + ownable = Ownable(ownerGroup="magrathea", accessGroups=["deep_though"]) + thumb_path = Path(__file__).parent / "data/SciCatLogo.png" + + time = get_file_mod_time(thumb_path) + assert time is not None + size = get_file_size(thumb_path) + assert size is not None + + # Instrument + instrument = Instrument( + pid="earth", name="Earth", 
customMetadata={"a": "field"} + ) + assert scicat.upload_instrument(instrument) == "earth" + assert scicat.instruments_create(instrument) == "earth" + assert scicat.instruments_update(instrument) == "earth" + + # Proposal + proposal = Proposal( + proposalId="deepthought", + title="Deepthought", + email="deepthought@viltvodle.com", + **ownable.dict() + ) + assert scicat.upload_proposal(proposal) == "deepthought" + assert scicat.proposals_create(proposal) == "deepthought" + assert scicat.proposals_update(proposal) == "deepthought" + + # Sample + sample = Sample( + sampleId="gargleblaster", + description="Gargleblaster", + sampleCharacteristics={"a": "field"}, + **ownable.dict() + ) + assert scicat.upload_sample(sample) == "gargleblaster" + assert scicat.samples_create(sample) == "gargleblaster" + assert scicat.samples_update(sample) == "gargleblaster" + + # RawDataset + dataset = RawDataset( + path="/foo/bar", + size=42, + owner="slartibartfast", + contactEmail="slartibartfast@magrathea.org", + creationLocation="magrathea", + creationTime=str(datetime.now()), + type="raw", + instrumentId="earth", + proposalId="deepthought", + dataFormat="planet", + principalInvestigator="A. Mouse", + sourceFolder="/foo/bar", + scientificMetadata={"a": "field"}, + sampleId="gargleblaster", + **ownable.dict() + ) + dataset_id = scicat.upload_new_dataset(dataset) + assert dataset_id == "42" + + # Update record + dataset.principalInvestigator = "B. Turtle" + dataset_id_2 = scicat.update_dataset(dataset, dataset_id) + assert dataset_id_2 == dataset_id + + # Datablock with DataFiles + data_file = DataFile(path="/foo/bar", size=42) + data_block = Datablock( + size=42, + version="1", + datasetId=dataset_id, + dataFileList=[data_file], + **ownable.dict() + ) + scicat.upload_dataset_origdatablock(data_block) + + # Attachment + attachment = Attachment( + datasetId=dataset_id, + thumbnail=encode_thumbnail(thumb_path), + caption="scattering image", + **ownable.dict() + ) + scicat.upload_attachment(attachment) + + +def test_get_dataset(): + with requests_mock.Mocker() as mock_request: + dataset = RawDataset( + size=42, + owner="slartibartfast", + contactEmail="slartibartfast@magrathea.org", + creationLocation="magrathea", + creationTime=str(datetime.now()), + instrumentId="earth", + proposalId="deepthought", + dataFormat="planet", + principalInvestigator="A. 
Mouse", + sourceFolder="/foo/bar", + scientificMetadata={"a": "field"}, + sampleId="gargleblaster", + ownerGroup="magrathea", + accessGroups=["deep_though"], + ) + mock_request.get( + local_url + "Datasets/123", json=dataset.dict(exclude_none=True) + ) + + client = from_token(base_url=local_url, token="a_token") + retrieved = client.datasets_get_one("123") + assert retrieved == dataset.dict(exclude_none=True) + + +def test_get_nonexistent_dataset(): + with requests_mock.Mocker() as mock_request: + mock_request.get( + local_url + "Datasets/74", + status_code=404, + reason="Not Found", + json={ + "error": { + "statusCode": 404, + "name": "Error", + "message": 'Unknown "Dataset" id "74".', + "code": "MODEL_NOT_FOUND", + } + }, + ) + client = from_token(base_url=local_url, token="a_token") + assert client.datasets_get_one("74") is None + + +def test_get_dataset_bad_url(): + with requests_mock.Mocker() as mock_request: + mock_request.get( + "http://localhost:3000/api/v100/Datasets/53", + status_code=404, + reason="Not Found", + json={ + "error": { + "statusCode": 404, + "name": "Error", + "message": "Cannot GET /api/v100/Datasets/53", + } + }, + ) + client = from_token(base_url="http://localhost:3000/api/v100", token="a_token") + with pytest.raises(ScicatCommError): + client.datasets_get_one("53") + + +def test_initializers(): + with requests_mock.Mocker() as mock_request: + add_mock_requests(mock_request) + + client = from_token(local_url, "let me in!") + assert client._token == "let me in!" diff --git a/tests/tests_integration/__init__.py b/tests/tests_integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tests_integration/tests_integration.py b/tests/tests_integration/tests_integration.py new file mode 100644 index 0000000..86ad83c --- /dev/null +++ b/tests/tests_integration/tests_integration.py @@ -0,0 +1,94 @@ +from pyscicat.client import ScicatClient +from pyscicat.model import RawDataset, Ownable +from datetime import datetime +import os + + +""" +These test_pyscicat do not use mocks and are designed to connect + to a v4 service for Scicat backend. You can run this easily +in docker-compose following the repo +https://github.com/SciCatProject/scicatlive. +You will also need to use one of the default user accounts or add +your own. + +You will need to set environmental variables for +BASE_URL - the url of your scicat service e.g. http://localhost:3000/api/v3 +SCICAT_USER - the name of your scicat user. +SCICAT_PASSWORD - the password for your scicat user. +""" + +sci_clie = ScicatClient(base_url=os.environ["BASE_URL"], + token=None, + username=os.environ["SCICAT_USER"], + password=os.environ["SCICAT_PASSWORD"]) + + +def test_client(): + assert type(sci_clie) == ScicatClient # noqa: E721 + + +def test_upload_dataset(): + ownable = Ownable(ownerGroup="ingestor", accessGroups=[]) + payload = RawDataset( + datasetName="a new guide book", + path="/foo/bar", + size=42, + packedSize=0, + owner=os.environ["SCICAT_USER"], + contactEmail="slartibartfast@magrathea.org", + creationLocation="Magrathea", + creationTime=datetime.isoformat(datetime.now()), + instrumentId="earth", + proposalId="deepthought", + dataFormat="planet", + principalInvestigator="A. 
Mouse", + sourceFolder="/foo/bar", + scientificMetadata={"type": "string", "value": {"a": "field"}}, + sampleId="gargleblaster", + type="raw", + ownerEmail="scicatingestor@your.site", + sourceFolderHost="s3.heartofgold.org", + endTime=datetime.isoformat(datetime.now()), + techniques=[], + numberOfFiles=0, + numberOfFilesArchived=0, + **ownable.dict() + ) + + sci_clie.upload_new_dataset(payload) + + +def test_get_dataset(): + + datasets = sci_clie.get_datasets({"ownerGroup": "ingestor"}) + + for dataset in datasets: + assert dataset["ownerGroup"] == "ingestor" + + +def test_update_dataset(): + sci_clie = ScicatClient(base_url=os.environ["BASE_URL"], + token=None, + username=os.environ["SCICAT_USER"], + password=os.environ["SCICAT_PASSWORD"]) + + datasets = sci_clie.get_datasets({}) + pid = datasets[0]["pid"] + payload = RawDataset( + size=142, + owner="slartibartfast", + ownerGroup="Magrateheans", + contactEmail="slartibartfast@magrathea.org", + creationLocation="magrathea", + creationTime=datetime.isoformat(datetime.now()), + instrumentId="earth", + proposalId="deepthought", + dataFormat="planet", + principalInvestigator="A. Mouse", + sourceFolder="/foo/bar", + scientificMetadata={"a": "field"}, + sampleId="gargleblaster", + accessGroups=["Vogons"] + ) + sci_clie.update_dataset(payload, pid)