Merge pull request #52 from eapearson/fix-SCT-3001-and-3002
Fix SCT-3001, SCT-3002 and friends
scanon committed Apr 21, 2021
2 parents a075505 + 26173ab commit e94fe95
Showing 57 changed files with 2,030 additions and 1,894 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG.md
@@ -6,7 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [1.0.0] - 2021-03-05
## [1.0.0] - 2021-04-20
### Fixed
- fix SCT-2930: not honoring withPublic and withPrivate
- fix SCT-2931: maximum reported search results was 10,000 - fixed to report actual search results with > 10K.
@@ -16,9 +16,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- fix SCT-2947: unauthenticated search not working correctly (no ws or narr info)
- fix SCT-2969: not honoring object_types
- fix SCT-2970: not narrowing search with additional terms
- fix SCT-3001: Data-search ui / Searchapi2 legacy endpoint generates incorrect object landing page links
- fix SCT-3002: Searchapi2 legacy endpoint returns incorrect index and index version

### Added
- implement SCT-2966: add a "build and test" workflow for GitHub Actions which builds an image, runs tests, and pushes the resulting image too GH Container Registry.
- implement SCT-2966: add a "build and test" workflow for GitHub Actions which builds an image, runs tests, and pushes the resulting image to GH Container Registry.

## [0.4.9] - 2020-09-11
### Changed
2 changes: 1 addition & 1 deletion Makefile
@@ -4,7 +4,7 @@
test:
sh scripts/run_tests

build-integration-test-images:
build-dev-images:
@echo Building integration test images...
sh scripts/build-integration-test-images.sh
@echo Integration test images built
5 changes: 4 additions & 1 deletion README.md
@@ -31,6 +31,9 @@ The [search configuration file](https://github.com/kbase/index_runner_spec/blob/
* `-32003` - Elasticsearch response error
* `-32004` - User profile service response error
* `-32005` - Unknown workspace type
* `-32006` - Access group missing
* `-32007` - User profile missing


### `<url>/rpc`

@@ -80,7 +83,7 @@ Show the names of all indexes, and show what aliases stand for what indexes.

### <url>/legacy

A JSON-RPC 2.0 API that mimics the legacy Java server, [found here](https://github.com/kbase/KBaseSearchEngine). Refer to the `src/search1_rpc/schemas` file for a reference on the method parameter types.
A JSON-RPC 1.1 API that mimics the legacy Java server, [found here](https://github.com/kbase/KBaseSearchEngine). Refer to the `src/search1_rpc/schemas` file for a reference on the method parameter types.

## Development

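The legacy endpoint described above speaks JSON-RPC 1.1 over HTTP POST. A minimal sketch of a client call follows; note that the deployment URL, the method name `KBaseSearchEngine.search_objects`, and the parameter shape are assumptions based on the legacy Java API, not confirmed by this diff.

```python
import json
import urllib.request

# Assumed deployment URL -- adjust for a real environment.
URL = "https://ci.kbase.us/services/searchapi2/legacy"

payload = {
    "version": "1.1",  # JSON-RPC 1.1, as the legacy Java server expects
    "id": "0",
    "method": "KBaseSearchEngine.search_objects",  # assumed method name
    "params": [{
        "match_filter": {"full_text_in_all": "coli"},
        "pagination": {"start": 0, "count": 10},
    }],
}

request = urllib.request.Request(
    URL,
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
# response = urllib.request.urlopen(request)  # requires network access
```

Note that JSON-RPC 1.1 uses a `version` member rather than the `jsonrpc` member of 2.0, which is one reason the README wording above was corrected.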
20 changes: 20 additions & 0 deletions docs/ez-guide.md
@@ -4,6 +4,16 @@ This is a set of tasks which worked well for me on macOS.

## Unit Testing

### Host dependencies

You will need to have, at a minimum:

- make
- python 3.7
- docker

### Set up python

Install virtual environment for python:

```sh
@@ -24,6 +34,8 @@ Unit tests are run locally, so need to install all python dependencies:
poetry install
```

### Run Tests

> TODO: should be able to run unit tests in a container, to avoid the need for any host-level installs.
Run the tests!
@@ -34,6 +46,12 @@ This will run all the unit tests plus associated code quality evaluations.
make test
```

or for coverage

```sh
make test-coverage
```

> Note: Ensure that https://ci.kbase.us is accessible from the host machine; some unit tests require this (and should not be unit tests!)
To run tests in a given directory or individual test modules:
@@ -55,6 +73,8 @@ See [Integration Testing](integration-testing.md)

## Using with kbase-ui

This workflow is very handy for working on both the search api back end and search tool front ends.

```
IP="<IP HERE>" SSHHOST="login1.berkeley.kbase.us" SSHUSER="<KBASE DEV USERNAME>" SSHPASS="<KBASE DEV PWD>" make run-dev-server
```
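The guide's note that some unit tests require https://ci.kbase.us to be reachable suggests a quick preflight check before running `make test`. A sketch under that assumption (the helper name and timeout are illustrative, not part of this commit):

```python
import urllib.error
import urllib.request


def kbase_ci_reachable(url: str = "https://ci.kbase.us", timeout: float = 5.0) -> bool:
    """Return True if the KBase CI host answers an HTTP request at all."""
    try:
        urllib.request.urlopen(url, timeout=timeout)
        return True
    except urllib.error.HTTPError:
        # The server answered, even if with an error status; that is enough
        # for the network-dependent unit tests to attempt their requests.
        return True
    except (urllib.error.URLError, OSError):
        return False
```

Running this before the test suite turns a confusing mid-run failure into an immediate, explicit one.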
15 changes: 8 additions & 7 deletions src/es_client/query.py
@@ -25,13 +25,9 @@ def search(params, meta):
# The query object, which we build up in steps below
query = {'bool': {}} # type: dict

# if not params.get('only_public') and meta['auth']:

# Fetch the workspace IDs that the user can read
# Used for access control and also to ensure that
# workspaces which are inaccessible in the workspace,
# but that fact is not yet updated in search, are
# still filtered out.
# Fetch the workspace IDs that the user can read.
# Used for access control and also to ensure that workspaces which are
# inaccessible, but have not yet been updated in search, are still filtered out.
authorized_ws_ids = ws_auth(
meta['auth'],
params.get('only_public', False),
@@ -52,6 +48,11 @@
# Make a query request to elasticsearch
url = config['elasticsearch_url'] + '/' + index_name_str + '/_search'

# TODO: address the performance settings below:
# - 3m for timeout seems excessive, and many other elements of the
# search process may have 1m timeouts; perhaps default to a lower limit, and
# allow a parameter to set the timeout to an arbitrary value
# - the "allow_expensive_queries" setting has been disabled, why?
options = {
'query': query,
'size': 0 if params.get('count') else params.get('size', 10),
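One way to act on the TODO comment added in this hunk is to derive the Elasticsearch timeout from a bounded, caller-supplied parameter instead of a hard-coded `3m`. A sketch; the parameter name `timeout`, the 1m default, and the 3m cap are assumptions, not part of this commit:

```python
_DEFAULT_TIMEOUT_SEC = 60   # assumed default, in line with the 1m timeouts noted above
_MAX_TIMEOUT_SEC = 180      # never exceed the previously hard-coded 3m


def es_timeout(params: dict) -> str:
    """Return an Elasticsearch timeout string such as '60s', clamped to a sane range."""
    requested = params.get('timeout', _DEFAULT_TIMEOUT_SEC)
    clamped = max(1, min(int(requested), _MAX_TIMEOUT_SEC))
    return f'{clamped}s'


# Hypothetical use inside the options dict built in search():
options = {
    'query': {'bool': {}},  # placeholder for the query assembled earlier
    'timeout': es_timeout({'timeout': 120}),
}
```

Clamping keeps a misbehaving client from holding a search open indefinitely while still letting callers trade latency for completeness.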
55 changes: 39 additions & 16 deletions src/exceptions.py
@@ -1,30 +1,37 @@
from src.utils.obj_utils import get_path


# TODO: we should use the jsonrpc11base libraries base exception for
# server errors
class ResponseError(Exception):

def __init__(self, code=-32000, message='Server error'):
self.message = message
self.jsonrpc_code = code


class UnknownType(ResponseError):
class AuthError(ResponseError):

def __init__(self, message) -> object:
super().__init__(code=-32005, message=message)
def __init__(self, auth_resp_json, resp_text):
# Extract the error message from the auth service's RPC response
msg = get_path(auth_resp_json, ['error', 'message'])
if msg is None:
# Fall back to the full response body
msg = resp_text
super().__init__(code=-32001, message=msg)


class ElasticsearchError(ResponseError):
class UnknownIndex(ResponseError):

def __init__(self, message):
msg = f"Elasticsearch request error:\n{message}"
super().__init__(code=-32003, message=msg)
super().__init__(code=-32002, message=message)


class UnknownIndex(ResponseError):
class ElasticsearchError(ResponseError):

def __init__(self, message):
super().__init__(code=-32002, message=message)
msg = f"Elasticsearch request error:\n{message}"
super().__init__(code=-32003, message=msg)


class UserProfileError(ResponseError):
@@ -36,12 +43,28 @@ def __init__(self, url, resp_text):
super().__init__(code=-32004, message=msg)


class AuthError(ResponseError):
class UnknownType(ResponseError):

def __init__(self, resp_json, resp_text):
# Extract the error message from the RPC response
msg = get_path(resp_json, ['error', 'message'])
if msg is None:
# Fall back to the full response body
msg = resp_text
super().__init__(code=-32001, message=msg)
def __init__(self, message) -> object:
super().__init__(code=-32005, message=message)


class NoAccessGroupError(ResponseError):
"""
Raised when a search result does not contain an "access_group"
key, which should be impossible.
"""

def __init__(self):
message = 'A search document does not contain an access group'
super().__init__(code=-32006, message=message)


class NoUserProfileError(ResponseError):
"""
Raised when a username does not have an associated user profile.
"""

def __init__(self, username):
message = f'A user profile could not be found for "{username}"'
super().__init__(code=-32007, message=message)
47 changes: 18 additions & 29 deletions src/search1_conversion/convert_params.py
@@ -33,15 +33,10 @@
entirety. Longer fields are shown as snippets preceded or followed by
"...".
"""
from src.utils.config import config

from src.utils.obj_utils import get_any
from jsonrpc11base.errors import InvalidParamsError

# Unversioned feature index name/alias, (eg "genome_features")
_FEATURES_UNVERSIONED = config['global']['genome_features_current_index_name']
# Versioned feature index name (eg "genome_features_2")
_GENOME_FEATURES_IDX_NAME = config['global']['latest_versions'][_FEATURES_UNVERSIONED]

# Mapping of special sorting properties names from the Java API to search2 key names
_SORT_PROP_MAPPING = {
'scientific_name': 'scientific_name.raw',
@@ -99,7 +94,6 @@ def search_types(params):
with_all_history - ignored
output:
type_to_count - dict where keys are type names and vals are counts
search_time - int - total time performing search
This method constructs the same search parameters as `search_objects`, but
aggregates results based on `obj_type_name`.
"""
@@ -119,17 +113,12 @@ def get_objects(params):
Convert params from the "get_objects" RPC method into an Elasticsearch query.
Retrieve a list of objects based on their upas.
params:
guids - list of string - KBase IDs (upas) to fetch
post_processing - object of post-query filters (see PostProcessing def at top of this module)
ids - list of string - Search document ids to fetch; ids are in a specific
format for object indexes: "WS::<wsid>:<objid>"
output:
objects - list of ObjectData - see the ObjectData type description in the module docstring above.
search_time - int - time it took to perform the search on ES
access_group_narrative_info - dict of {access_group_id: narrative_info} -
Information about the workspaces in which the objects in the
results reside. This data only applies to workspace objects.
query - elasticsearch query for document ids specified in the params argument
"""
query = {'query': {'terms': {'_id': params['guids']}}}
return query
return {'query': {'terms': {'_id': params['ids']}}}


def _get_search_params(params):
@@ -179,10 +168,13 @@ def _get_search_params(params):
}
})
else:
raise InvalidParamsError(message="Invalid timestamp range in match_filter/timestamp")
raise InvalidParamsError(
message="Invalid timestamp range in match_filter/timestamp")

# Handle a search on tags, which corresponds to the generic `tags` field in all indexes.
# search_tags is populated on a workspace to indicate the type of workspace. Currently
# Handle a search on tags, which corresponds to the generic `tags` field in all
# indexes.
# search_tags is populated on a workspace to indicate the type of workspace.
# Currently
# supported are "narrative", "refseq", and "noindex"
if match_filter.get('source_tags'):
# If source_tags_blacklist is `1`, then we are **excluding** these tags.
@@ -202,11 +194,9 @@
object_types = params.get('object_types', [])
if object_types:
# For this fake type, we search on the specific index instead (see lower down).
type_blacklist = ['GenomeFeature']
query['bool']['filter']['bool']['should'] = [
{'term': {'obj_type_name': obj_type}}
for obj_type in object_types
if obj_type not in type_blacklist
]

# Translate with_private and with_public to only_private and only_public.
@@ -238,9 +228,9 @@
# Handle sorting options
if 'sorting_rules' not in params:
params['sorting_rules'] = [{
"property": "timestamp",
"is_object_property": 0,
"ascending": 1
"property": "timestamp",
"is_object_property": 0,
"ascending": 1
}]
sort = [] # type: list
for sort_rule in params['sorting_rules']:
@@ -280,20 +270,19 @@
'track_total_hits': True
}

if 'GenomeFeature' in object_types:
search_params['indexes'] = [_GENOME_FEATURES_IDX_NAME]

return search_params


def _handle_lookup_in_keys(match_filter, query):
"""
Handle the match_filter/lookup_in_keys option from the legacy API.
This allows the user to pass a number of field names and term or range values for filtering.
This allows the user to pass a number of field names and term or range values for
filtering.
"""
if not match_filter.get('lookup_in_keys'):
return query
# This will be a dict where each key is a field name and each val is a MatchValue type
# This will be a dict where each key is a field name and each val is a MatchValue
# type
lookup_in_keys = match_filter['lookup_in_keys']
for (key, match_value) in lookup_in_keys.items():
# match_value will be a dict with one of these keys set:
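The sorting-rules handling in `_get_search_params` maps legacy property names through `_SORT_PROP_MAPPING` before building the Elasticsearch `sort` clause, falling back to a descending-by-default timestamp rule. A condensed, standalone sketch of that translation; the mapping entry is the one visible in this diff, and everything else is illustrative rather than a copy of the module:

```python
# Mapping of special legacy sort property names to search2 key names
# (only the entry visible in this diff; the full table may be longer).
_SORT_PROP_MAPPING = {
    'scientific_name': 'scientific_name.raw',
}


def build_sort(sorting_rules):
    """Translate legacy sorting_rules into an Elasticsearch sort list."""
    if not sorting_rules:
        # Default rule, as in the hunk above.
        sorting_rules = [{
            'property': 'timestamp',
            'is_object_property': 0,
            'ascending': 1,
        }]
    sort = []
    for rule in sorting_rules:
        prop = _SORT_PROP_MAPPING.get(rule['property'], rule['property'])
        order = 'asc' if rule.get('ascending', 1) else 'desc'
        sort.append({prop: {'order': order}})
    return sort
```

For example, a legacy rule on `scientific_name` sorts on the `scientific_name.raw` keyword field, since sorting on an analyzed text field would order by individual tokens rather than the whole name.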