Merge pull request #410 from GateNLP/dev

Release 2.2.0
GateNLP · May 8, 2024 · 8b54bc1 · 8b54bc1
2 parents 57946de + fe1790d
commit 8b54bc1
Show file tree

Hide file tree

Showing 69 changed files with 9,042 additions and 828 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,43 @@ docker compose run --rm -it pgbackups /backup.sh
 
 (or `docker-compose` if your version of Docker does not support compose v2).
 
+## [2.2.0] 2024-05-08
+
+### Changed
+- **Breaking change**: When exporting annotations as JSON, the "features" that the annotator entered are no longer nested under `label` ([#347](https://github.com/GateNLP/gate-teamware/issues/347)).  Where previously the export would have been 
+  ```json
+  {
+    "features": {
+      "label": {
+        "field1": "value1"
+      }
+    }
+  }
+  ```
+
+  it is now
+  ```json
+  {
+    "features": {
+      "field1": "value1"
+    }
+  }
+  ```
+- Include details of failed annotations in export formats ([#399](https://github.com/GateNLP/gate-teamware/pull/399))
+  - When exporting annotation data from projects (both via the web UI and using the command line tool),
+    each document includes details of which users _rejected_, _timed out_ or _aborted_ annotation of
+    that document, as well as the annotation data from the users who completed the document successfully.
+    This can be useful for the project manager to identify documents that are particularly difficult
+    to annotate, perhaps suggesting that the annotation guidelines need to be extended or clarified.
+
+### Fixed
+- Upgraded a number of third-party dependencies to close various vulnerabilities ([#397](https://github.com/GateNLP/gate-teamware/pull/397))
+- Fixed several issues relating to the export of annotated data ([#377](https://github.com/GateNLP/gate-teamware/pull/377))
+  - "Anonymous" export was not properly anonymous ([#345](https://github.com/GateNLP/gate-teamware/issues/345))
+  - Teamware now does a better job of preserving the GATE BDOC JSON structure when exporting documents that were originally uploaded in that format ([#346](https://github.com/GateNLP/gate-teamware/issues/346), [#348](https://github.com/GateNLP/gate-teamware/issues/348))
+- Added an explicit setting for "no email security", as an alternative to the implicit setting when the relevant environment variable is omitted.  This is because the implicit setting was lost on upgrades, whereas an explicit "none" will be preserved ([#402](https://github.com/GateNLP/gate-teamware/pull/402))
+
+
 ## [2.1.1] 2023-10-02
 
 ### Added 

diff --git a/CITATION.cff b/CITATION.cff
@@ -1,6 +1,6 @@
 abstract: A web application for collaborative document annotation. GATE teamware provides
   a flexible web app platform for managing classification of documents by human annotators.
-authors: 
+authors:
 - affiliation: The University of Sheffield
   email: [email protected]
   family-names: Karmakharm
@@ -33,13 +33,7 @@ keywords:
 - document annotation
 license: AGPL-3.0
 message: If you use this software, please cite it using the metadata from this file.
-repository-code: https://github.com/GateNLP/gate-teamware
-title: GATE Teamware
-type: software
-url: https://gatenlp.github.io/gate-teamware/
-version: 2.1.1
 preferred-citation:
-  type: conference-paper
   authors:
   - affiliation: The University of Sheffield
     email: [email protected]
@@ -66,14 +60,22 @@ preferred-citation:
     family-names: Bontcheva
     given-names: Kalina
     orcid: https://orcid.org/0000-0001-6152-9600
+  collection-title: 'Proceedings of the 17th Conference of the European Chapter of
+    the Association for Computational Linguistics: System Demonstrations'
   doi: 10.18653/v1/2023.eacl-demo.17
-  title: "GATE Teamware 2: An open-source tool for collaborative document classification annotation"
-  collection-title: "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations"    
+  end: 151
   location:
     name: Dubrovnik, Croatia
-  year: 2023
   month: 5
-  start: 145
-  end: 151
   publisher:
     name: Association for Computational Linguistics
+  start: 145
+  title: 'GATE Teamware 2: An open-source tool for collaborative document classification
+    annotation'
+  type: conference-paper
+  year: 2023
+repository-code: https://github.com/GateNLP/gate-teamware
+title: GATE Teamware
+type: software
+url: https://gatenlp.github.io/gate-teamware/
+version: 2.2.0
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.1.1
+2.2.0
diff --git a/backend/models.py b/backend/models.py
@@ -978,25 +978,28 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
         # Create dictionary for document
         doc_dict = None
         if json_format == "raw" or json_format == "csv":
-            doc_dict = self.data
+            doc_dict = self.data.copy()
         elif json_format == "gate":
+            # Documents in GATE JSON format are expected to have an existing "features" field
+            features_dict = dict(self.data["features"]) if "features" in self.data and isinstance(self.data["features"], dict) else {}
 
-            ignore_keys = {"text", self.project.document_id_field}
-            features_dict = {key: value for key, value in self.data.items() if key not in ignore_keys}
+            # Add any non-compliant top-level fields into the "features" field instead
+            ignore_keys = {"text", "features", "offset_type", "annotation_sets", self.project.document_id_field}
+            features_dict.update({key: value for key, value in self.data.items() if key not in ignore_keys})
 
             doc_dict = {
                 "text": self.data["text"],
                 "features": features_dict,
-                "offset_type": "p",
+                "offset_type": self.data["offset_type"] if "offset_type" in self.data else "p",  # Use original offset type
                 "name": get_value_from_key_path(self.data, self.project.document_id_field)
             }
-            pass
 
         # Insert annotation sets into the doc dict
         annotations = self.annotations.filter(status=Annotation.COMPLETED)
         if json_format == "csv":
+            # Gets pre-existing annotations
+            annotation_sets = dict(self.data["annotations"]) if "annotations" in self.data else {}
             # Format annotations for CSV export
-            annotation_sets = {}
             for annotation in annotations:
                 a_data = annotation.data
                 annotation_dict = {}
@@ -1009,36 +1012,58 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
                 annotation_dict["duration_seconds"] = annotation.time_to_complete
 
                 if anonymize:
-                    annotation_sets[str(annotation.user.id)] = annotation_dict
+                    annotation_sets[f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}"] = annotation_dict
                 else:
                     annotation_sets[annotation.user.username] = annotation_dict
 
             doc_dict["annotations"] = annotation_sets
 
         else:
+            # Gets pre-existing annotations
+            annotation_sets = dict(self.data["annotation_sets"]) if "annotation_sets" in self.data else {}
             # Format for JSON in line with GATE formatting
-            annotation_sets = {}
             for annotation in annotations:
                 a_data = annotation.data
+                anonymized_name = f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}"
                 annotation_set = {
-                    "name": annotation.user.id if anonymize else annotation.user.username,
+                    "name": anonymized_name if anonymize else annotation.user.username,
                     "annotations": [
                         {
                             "type": "Document",
                             "start": 0,
                             "end": 0,
                             "id": 0,
                             "duration_seconds": annotation.time_to_complete,
-                            "features": {
-                                "label": a_data
-                            }
+                            "features": a_data
                         }
                     ],
                     "next_annid": 1,
                 }
-                annotation_sets[annotation.user.username] = annotation_set
+                annotation_sets[anonymized_name if anonymize else annotation.user.username] = annotation_set
+
             doc_dict["annotation_sets"] = annotation_sets
 
+        # Add to the export the lists (possibly empty) of users who rejected,
+        # timed out or aborted annotation of this document
+        teamware_status = {}
+        for key, status in [
+            ("rejected_by", Annotation.REJECTED),
+            ("timed_out", Annotation.TIMED_OUT),
+            ("aborted", Annotation.ABORTED),
+        ]:
+            teamware_status[key] = [
+                f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}" if anonymize else annotation.user.username
+                for annotation in self.annotations.filter(status=status)
+            ]
+            if json_format == "csv":
+                # Flatten list if exporting as CSV
+                teamware_status[key] = ",".join(str(val) for val in teamware_status[key])
+
+        if json_format == "gate":
+            doc_dict["features"]["teamware_status"] = teamware_status
+        else:
+            doc_dict["teamware_status"] = teamware_status
+
         return doc_dict
 
 

diff --git a/backend/rpc.py b/backend/rpc.py
@@ -510,7 +510,7 @@ def get_projects(request, current_page=1, page_size=None, filters=None):
     # Perform filtering
     if isinstance(filters, str):
         # Search project title if the filter is only a string
-        projects_query = Project.objects.filter(name__contains=filters.strip())
+        projects_query = Project.objects.filter(name__icontains=filters.strip())
         total_count = projects_query.count()
     else:
         projects_query = Project.objects.all()