Commit ff4ce31 — Merge branch 'release/0.3.1'

cverluise committed Dec 23, 2020
2 parents 8193177 + 084c5eb

Showing 18 changed files with 378 additions and 162 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -2,13 +2,13 @@
</h1>

<p align="center">
<img src="https://img.shields.io/badge/release-0.3.0-yellow">
<img src="https://img.shields.io/badge/release-0.3.1-yellow">
<a href="https://cverluise.github.io/PatCit/">
<img alt="Documentation" src="https://img.shields.io/badge/website-online-brightgreen">
<img src="https://img.shields.io/badge/code-MIT-green">
<img src="https://img.shields.io/badge/data-CC%20BY%204.0-blue">
<a href="https://doi.org/10.5281/zenodo.4244176">
<img src="https://img.shields.io/badge/zenodo-0.3.0-darkblue" alt="DOI"></a>
<a href="https://doi.org/10.5281/zenodo.3710993">
<img src="https://img.shields.io/badge/zenodo-0.3.1-darkblue" alt="DOI"></a>
<img src="https://img.shields.io/badge/models-dvc-purple">
</p>

@@ -92,7 +92,7 @@ Patents|<p align="center">✅</p>|<p align="center">✅</p>|<p align="center">

## FAIR

[patcit-zenodo]:https://zenodo.org/record/4244176
[patcit-zenodo]:https://doi.org/10.5281/zenodo.3710993
[bq-quickstart]:https://cloud.google.com/bigquery/docs/quickstarts/quickstart-web-ui

📍 **Find** - The patCit dataset is available on [BigQuery][patcit-bq] in an interactive environment. For those who have a smattering of SQL, this is the perfect place to explore the data. It can also be downloaded on [Zenodo][patcit-zenodo].
5 changes: 5 additions & 0 deletions data/.gitignore
@@ -24,3 +24,8 @@
/ent_std_test.json
/ent_std_train.json
/ent_std_xx.jsonl
/val_detect_patent_falsenegatives.jsonl
/val_detect_patent_falsepositives.jsonl
/val_detect_patent_pred_goldsample.jsonl
/val_parsing_patent_kindcode_gold.jsonl
/val_parsing_patent_kindcodenonnull_gold.jsonl
3 changes: 3 additions & 0 deletions data/val_detect_patent_falsenegatives.jsonl.dvc
@@ -0,0 +1,3 @@
outs:
- md5: 107424dbe4e15f64f85d419d6ceb7c3c
path: val_detect_patent_falsenegatives.jsonl
3 changes: 3 additions & 0 deletions data/val_detect_patent_falsepositives.jsonl.dvc
@@ -0,0 +1,3 @@
outs:
- md5: f2c27fba5a01de3fb8483462f97ac5b5
path: val_detect_patent_falsepositives.jsonl
3 changes: 3 additions & 0 deletions data/val_detect_patent_pred_goldsample.jsonl.dvc
@@ -0,0 +1,3 @@
outs:
- md5: f72f2fff2314fa1d2af4e19cf847a417
path: val_detect_patent_pred_goldsample.jsonl
3 changes: 3 additions & 0 deletions data/val_parsing_patent_kindcode_gold.jsonl.dvc
@@ -0,0 +1,3 @@
outs:
- md5: aa71efdc6ee85a3ce354a431b2de83d0
path: val_parsing_patent_kindcode_gold.jsonl
3 changes: 3 additions & 0 deletions data/val_parsing_patent_kindcodenonnull_gold.jsonl.dvc
@@ -0,0 +1,3 @@
outs:
- md5: 9ea70bf3e8665ba9dff06c973da8fc28
path: val_parsing_patent_kindcodenonnull_gold.jsonl
2 changes: 1 addition & 1 deletion docs/download.md
@@ -1,5 +1,5 @@
[gs-quickstart]:https://cloud.google.com/storage/docs/quickstarts-console
[zen-patcit]:https://zenodo.org/record/4244176
[zen-patcit]:https://doi.org/10.5281/zenodo.3710993
[^bq-save]:You can save small tables (less than 16,000 rows) to clipboard, locally or to Google sheets. You can save mid-size (less than 1Gb) tables to Google Drive. Larger tables have to be saved to BigQuery, then to Google Cloud Storage and from there you can download them locally.


5 changes: 4 additions & 1 deletion docs/index.md
@@ -5,6 +5,7 @@
[patcit-academic]:https://docs.google.com/presentation/d/11COlz64EZn8PipXvnDBBZI_bnDD0fpm6tyx1_EqD6lU/edit?usp=sharing
[patcit-website]:https://cverluise.github.io/PatCit/
[patcit-newsletter]:https://tinyletter.com/patcit
[zen-patcit]:https://doi.org/10.5281/zenodo.3710993

[^1]: Front page NPL citations contain bibliographical references, office actions, search reports, patents, webpages, wikis, norms & standards, product documentations, databases and litigations. Patent text notably contains citations of patents, NPL, software, databases and products.

@@ -21,7 +22,9 @@ It is now common to use front-page *patent* citations to study some aspects of t

### Getting started

👩‍🔬 Exploring the universe of patent citations has never been easier. No more complicated data set-up, memory issue and queries running for ever, we host [patCit on BigQuery][patcit-bq] for you. We also release public access to [quickstart notebooks](../notebook).
🛢️ Exploring the universe of patent citations has never been easier. No more complicated data setup, memory issues, or queries running forever: we host [patCit on BigQuery][patcit-bq] for you. You can also download it on [Zenodo][zen-patcit].

👩‍🔬 Time to play! We give public access to [quickstart notebooks](../notebook).

🤗 patCit is community driven and benefits from the support of a responsive team that is happy to help and tackle your next request. This is where academics and industry practitioners meet.

52 changes: 38 additions & 14 deletions docs/recipes.md
@@ -148,27 +148,51 @@ QUERY=$(python patcit/main.py bq front-page-cat --meta npl-parsing.external.v03_
### Patent
- serialize
- Add `publication_number`
```shell script
ls *.csv.gz | cut -d_ -f4- | cut -d. -f1 | parallel -j +0 --eta "patcit serialize grobid-intext processed_us_description_{}.csv.gz --flavor pat >> pat_serialized_{}.jsonl"
```

- Add patent properties
- build `publication_number` index

- Make sure it is ordered along the primary key
```shell script
ls *.jsonl | parallel -j+0 --eta "jq -c '{status: .status, pubnum: .pubnum}' {} >> tmp_pubnum_index.jsonl"
sort tmp_pubnum_index.jsonl | uniq >> pubnum_index.jsonl
```
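The intent of the two commands above — project each serialized line down to its `status` and `pubnum` fields, then deduplicate — can be sketched in Python (a sketch only; the field names are taken from the jq filter, the sample records are hypothetical):

```python
import json

def build_pubnum_index(lines):
    """Collect distinct (status, pubnum) pairs from serialized citation lines."""
    seen = set()
    index = []
    for line in lines:
        record = json.loads(line)
        key = (record.get("status"), record.get("pubnum"))
        if key not in seen:  # mirrors `sort | uniq`
            seen.add(key)
            index.append({"status": key[0], "pubnum": key[1]})
    return index

lines = [
    '{"status": "SUCCESS", "pubnum": "US1234567"}',
    '{"status": "SUCCESS", "pubnum": "US1234567"}',  # duplicate, dropped
    '{"status": "FAIL", "pubnum": "EP0000001"}',
]
print(build_pubnum_index(lines))
```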

- Extract table to gs
```sql
SELECT
DISTINCT(CONCAT(orgname,original)) AS pubnum
FROM
`npl-parsing.patcit.v01_UScontextualPat`
```
- add `publication_number` (BQ)

```shell script
patcit serialize add-pubnum pubnum_index.jsonl >> publication_number_index.jsonl
```

- add patent properties (BQ)

```shell script
bq load ... publication_number_index.jsonl
bq load ... pat_serialize*.jsonl
bq query ...
```

!!! warning
Make sure it is ordered along the primary key

- extract to gs

```shell script
bq extract
```

!!! warning
Make sure that there is no primary key overlap between files

- prep table

```bash
ls intext_patent_flat_0000000000*.jsonl | parallel -j +0 --eta "sort {} | uniq >> distinct_{}"
ls distinct_intext_patent_flat_0000000000*.jsonl | parallel -j +0 --eta "jq -s -c 'group_by(.publication_number_o)[] | {publication_number: .[0].publication_number_o, publication_date: .[0].publication_date_o, appln_id: .[0].appln_id_o, pat_publn_id: .[0].pat_publn_id_o, docdb_family_id: .[0].docdb_family_id_o, inpadoc_family_id: .[0].inpadoc_family_id_o, citation: [ .[] | {country_code: .orgname, original_number: .original, publication_number: .publication_number, publication_date: .publication_date, appln_id: .appln_id, pat_publn_id: .pat_publn_id, docdb_family_id: .docdb_family_id, inpadoc_family_id: .inpadoc_family_id} ]}' {} >> $(sed -e 's/_flat//g') && gzip $(sed -e 's/_flat//g')"
bq load --source_format=NEWLINE_DELIMITED_JSON --max_bad_records=100 --ignore_unknown_values --replace patcit-public-data:intext.patent "gs://patcit_dev/intext/intext_patent*.jsonl.gz" schema/intext_patent.json
```

```shell script
ls flat_v031_intext_patent_* | cut -d_ -f2- | parallel -j 1 --eta "jq -s -c 'group_by(.publication_number_o)[] | {publication_number: .[0].publication_number_o, publication_date: .[0].publication_date_o, appln_id: .[0].appln_id_o, pat_publn_id: .[0].pat_publn_id_o, docdb_family_id: .[0].docdb_family_id_o, inpadoc_family_id: .[0].inpadoc_family_id_o, citation: [ group_by(.pubnum)[] | {country_code: .[0].orgname, original_number: .[0].original, kind_code: .[0].kindcode, type: .[0].type, status: .[0].status, pubnum: .[0].pubnum, epodoc: .[0].epodoc, publication_number: .[0].publication_number, publication_date: .[0].publication_date, appln_id: .[0].appln_id, pat_publn_id: .[0].pat_publn_id, docdb_family_id: .[0].docdb_family_id, inpadoc_family_id: .[0].inpadoc_family_id, char_start: [ .[] | .char_start], char_end: [ .[] | .char_end], service: .[0].service} ]}' flat_{} >> {} "
ls v031_*.jsonl | parallel -j+0 --eta "mv {} {}_tmp && sed 's/\[null\]/\[\]/g' {}_tmp >> {}"
bq load --source_format=NEWLINE_DELIMITED_JSON --max_bad_records=1000 --ignore_unknown_values --replace npl-parsing:external.v031_intext_patent "gs://patcit_dev/intext/v031_*.jsonl.gz" schema/intext_patent.json
```
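The jq one-liners above group flat rows by the citing publication and nest the cited patents under a `citation` array. A minimal Python sketch of that group-by (hypothetical records, only a few of the real fields shown):

```python
import json
from itertools import groupby

def nest_citations(flat_records):
    # Group flat rows by the citing publication (publication_number_o)
    # and nest the cited patents under `citation`, as the jq filter does.
    key = lambda r: r["publication_number_o"]
    nested = []
    for pub, rows in groupby(sorted(flat_records, key=key), key=key):
        rows = list(rows)
        nested.append({
            "publication_number": pub,
            "citation": [
                {"pubnum": r["pubnum"], "char_start": r["char_start"]}
                for r in rows
            ],
        })
    return nested

flat = [
    {"publication_number_o": "US-111-A", "pubnum": "EP-1-B", "char_start": 10},
    {"publication_number_o": "US-111-A", "pubnum": "EP-2-A", "char_start": 55},
]
print(json.dumps(nest_citations(flat)))
```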


2 changes: 1 addition & 1 deletion material-dev/prodigy/parsing_patent_recipe.py
@@ -33,7 +33,7 @@ def add_html(stream):

task["html"] = (
f"<span style='background-color:#775ec2;color:white;font-size:130%;font-weight:bold;'> "
f"{span[attr]} </span><br> \
f"{str(span.get(attr))} </span><br> \
{before} <span style='background-color: #fae284'><a \
href={root + suffix}>{span_}</a></span> \
{after}"
73 changes: 43 additions & 30 deletions patcit/bq/main.py
@@ -263,46 +263,38 @@ def front_page_cat(meta: str = None, cat: str = None, text_name: str = "npl_bibl
typer.echo(query)


def intext_patent_add_publication_number(
grobid_intext_patent: str = None, pubnum2publication_number: str = None
):
"""Query to add publication_number to grobid_intext_patent based on CONCAT(orgname, original) (
aka pubnum)"""
query = f"""
WITH
tmp AS(
SELECT
*,
CONCAT(orgname, original) AS pubnum
FROM
`{grobid_intext_patent}`) # npl-parsing.patcit.v01_UScontextualPat
SELECT
tmp.* EXCEPT(status, epodoc),
crossover.* EXCEPT(pubnum)
FROM
tmp
LEFT JOIN
`{pubnum2publication_number}` AS crossover # npl-parsing.external.v03_intext_pat_pubnum
ON
tmp.pubnum=crossover.pubnum"""
return query


@app.command()
def intext_patent(
grobid_intext_patent: str = None, patstat_patent_properties: str = None
grobid_intext_patent: str = None,
patstat_patent_properties: str = None,
index_publication_number: str = None,
):
"""Return query to create the intext_patent table"""
query = f"""
WITH
tmp AS (
WITH
intext_patent AS (
SELECT
patent.*,
index.* EXCEPT(pubnum,
status) #,
#"google_patents" AS linking_service
FROM
`{grobid_intext_patent}` AS patent # npl-parsing.external.v03_intext_patent
LEFT JOIN
`{index_publication_number}` AS index
#npl-parsing.external.v03_index_pat_publication_number
ON
patent.pubnum = index.pubnum)
SELECT
intext_patent.*,
patent_properties.* EXCEPT(publication_number)
FROM
`{grobid_intext_patent}` AS intext_patent # npl-parsing.external.v03_intext_pat
intext_patent # npl-parsing.external.v03_intext_pat
LEFT JOIN
`{patstat_patent_properties}` AS patent_properties
# npl-parsing.external.patstat_patent_properties
#npl-parsing.external.patstat_patent_properties
ON
intext_patent.publication_number =patent_properties.publication_number)
SELECT
@@ -315,11 +307,32 @@ def intext_patent(
FROM
tmp
LEFT JOIN
`npl-parsing.external.patstat_patent_properties` AS patent_properties
`{patstat_patent_properties}` AS patent_properties
#npl-parsing.external.patstat_patent_properties
ON
tmp.publication_number_o= patent_properties.publication_number
"""
return query
typer.echo(query)


@app.command(deprecated=True)
def update_publication_number(intext_patent: str = None):
"""Update publication_number with grobid parsed pubnum when no match and all 3 attributes
matched. Not used because that would mix docdb publication number with unknown formtat"""
query = f"""
UPDATE
`{intext_patent}`
#npl-parsing.external.v031r_intext_patent
SET
publication_number=pubnum,
linking_service='grobid'
WHERE
publication_number IS NULL
AND orgname IS NOT NULL
AND original IS NOT NULL
AND kindcode IS NOT NULL
"""
typer.echo(query)


if __name__ == "__main__":
71 changes: 58 additions & 13 deletions patcit/data/main.py
@@ -177,22 +177,14 @@ def prep_spacy_sam_patents(texts_file: str = None, citations_file: str = None):
def prep_citations_spans(citations_file):
with open(citations_file, "r") as fin:

def get_text_span(patent):
# TODO -> move to contextual_citation.fetch_patent
span = patent.find("ptr")["target"].replace("#string-range", "")
_, start, length = span.split(",")
length = length.replace(")", "")
start, end = (int(start), int(start) + int(length))
return start, end

reader = csv.DictReader(fin, fieldnames=["publication_number", "citations"])
out = {}
for l in reader:
soup = BeautifulSoup(l["citations"], features="lxml")
patents = soup.find_all("biblstruct", {"type": "patent"})
spans = []
for patent in patents:
start, end = get_text_span(patent)
start, end = intext.get_text_span(patent)
span = asyncio.run(
intext.fetch_patent(l["publication_number"], patent)
)
@@ -331,6 +323,33 @@ def bibref_silver_to_gold(file: str, model: str = None):
typer.echo(json.dumps(line))


@app.command()
def join_text_cit(texts_file: str, citations_file: str):
"""Return citation lines (jsonl) with the appropriate text
Expect:
- a csv file with publication_number,text fields
- a jsonl file with "publication_number", "spans", etc fields
Useful for populating val_detect_patent*.jsonl with text in order to contextualize citations
"""
with open(citations_file, "r") as lines:
citations = {}
for line in lines:
line = json.loads(line)
citations.update({line["publication_number"]: line})

with open(texts_file, "r") as fin:
reader = csv.DictReader(fin, fieldnames=["publication_number", "text"])
texts = {}
for line in reader:
texts.update({line["publication_number"]: line})

for k, v in citations.items():
v.update(texts[k])
typer.echo(json.dumps(v))

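A toy illustration of the join performed by `join_text_cit` above — citation lines and texts are both keyed by `publication_number`, then merged (records are hypothetical):

```python
import json

# Citation lines keyed by publication_number (from the jsonl file)
citations = {
    "US-123-A": {"publication_number": "US-123-A", "spans": [{"start": 0, "end": 6}]}
}
# Texts keyed by publication_number (from the csv file)
texts = {
    "US-123-A": {"publication_number": "US-123-A", "text": "EP-1-B discloses a pump."}
}

for pubnum, record in citations.items():
    record.update(texts[pubnum])  # attach the text to the citation line
    print(json.dumps(record))
```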

@app.command()
def prep_spacy_sam(
texts_file: str = None, citations_file: str = None, flavor: str = "patents"
Expand Down Expand Up @@ -397,7 +416,7 @@ def remove_space(start, end, text):
return spans


def contextualize_spans_(sam, nlp, context_window=10, attr=None):
def contextualize_spans_(sam, nlp, context_window=10, attr=None, report=False):
tmp = sam.copy()
text = tmp["text"]
tokens = nlp(tmp["text"]).to_json()["tokens"]
@@ -427,9 +446,16 @@ def contextualize_spans_(sam, nlp, context_window=10, attr=None):
assert span_["end"] in list(map(int, spacy_ends - context_start))

out.update({"spans": [span_]})

if attr:
out.update({"label": span_[attr]})
typer.echo(json.dumps(out))
if report:
text_, start_, end_ = out["text"], span_["start"], span_["end"]
typer.echo(
text_[:start_] + "`" + text_[start_:end_] + f" TAG`" + text_[end_:]
)
else:
typer.echo(json.dumps(out))

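The `report` branch prints the contextualized text with the detected span delimited by backticks and a `TAG` marker; for example (hypothetical input):

```python
text = "See EP-1-B for details."
start, end = 4, 10  # character span of the detected patent citation

# Same construction as the report line in contextualize_spans_
report = text[:start] + "`" + text[start:end] + " TAG`" + text[end:]
print(report)  # → See `EP-1-B TAG` for details.
```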

@app.command()
@@ -486,7 +512,9 @@ def report_alignment(file: str, context_window: int = 10):


@app.command()
def contextualize_spans(file: str, model: str = "en", attr: str = None):
def contextualize_spans(
file: str, model: str = "en", attr: str = None, report: bool = False
):
"""Contextualize spans
Expect jsonl with Simple Annotation Model lines
Expand All @@ -499,7 +527,7 @@ def contextualize_spans(file: str, model: str = "en", attr: str = None):
for line in fin:
sam = json.loads(line)
try:
contextualize_spans_(sam, nlp, attr=attr)
contextualize_spans_(sam, nlp, attr=attr, report=report)
except IndexError: # arises when the window exceeds the size of the doc
# E.g. IndexError: index 1191 is out of bounds for axis 0 with size 1191
pass
@@ -575,5 +603,22 @@ def keep_lang(texts, language_codes):
typer.echo(json.dumps(docs_json, indent=1, sort_keys=True))


@app.command()
def filter_validation(file, index):
"""Return the lines from FILE with publication number in index
index is expected to be a newline delimited list of publication numbers"""
index_lines = open(index, "r")
index_ = set(index_lines.read().split("\n"))

with open(file, "r") as lines:
for line in lines:
line = json.loads(line)
if line.get("publication_number") in index_:
typer.echo(json.dumps(line))

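A compact sketch of the filtering that `filter_validation` performs, with hypothetical lines and index entries:

```python
import json

index_ = {"US-1-A", "US-3-A"}  # newline-delimited index, loaded into a set
lines = [
    '{"publication_number": "US-1-A"}',
    '{"publication_number": "US-2-B"}',
]

# Keep only the lines whose publication number is in the index
kept = [
    json.loads(line)
    for line in lines
    if json.loads(line).get("publication_number") in index_
]
print(json.dumps(kept))  # only US-1-A survives
```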

if __name__ == "__main__":
app()