Skip to content

Commit

Permalink
Use 2019 shapefile, add USPC class, and fix bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
n3ssuno committed Feb 8, 2021
1 parent 4bf3c0b commit 212f380
Show file tree
Hide file tree
Showing 11 changed files with 221 additions and 144 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Carlo Bottai (Eindhoven University of Technology)
24 changes: 16 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ DATA_DIR_INTM = $(DATA_DIR)/interim
DATA_DIR_PROC = $(DATA_DIR)/processed

DATA_DIR_USPTO = $(DATA_DIR_RAW)/patentsview
DATA_DIR_PATEX = $(DATA_DIR_RAW)/patex
DATA_DIR_SHP = $(DATA_DIR_RAW)/cartography

SCRIPT_DIR = src
Expand All @@ -35,19 +36,26 @@ USPTO_URL = https://s3.amazonaws.com/data.patentsview.org/20200929/download
USPTO_FILES = patent.tsv.zip application.tsv.zip patent_inventor.tsv.zip location.tsv.zip uspatentcitation.tsv.zip
USPTO_TARGETS := $(foreach F,$(USPTO_FILES),$(DATA_DIR_USPTO)/$F)

SHP_URL = https://www2.census.gov/geo/tiger/GENZ2018/shp
SHP_FILES = cb_2018_us_cbsa_20m.zip
SHP_TARGETS := $(foreach F,$(SHP_FILES),$(DATA_DIR_SHP)/$F)

$(USPTO_TARGETS): $(DATA_DIR_USPTO)/%: $(SCRIPT_DIR)/download.py
python $< -i $(USPTO_URL)/$* -o $@

PATEX_URL = https://bulkdata.uspto.gov/data/patent/pair/economics/2019
PATEX_FILES = application_data.csv.zip
PATEX_TARGETS := $(foreach F,$(PATEX_FILES),$(DATA_DIR_PATEX)/$F)

$(PATEX_TARGETS): $(DATA_DIR_PATEX)/%: $(SCRIPT_DIR)/download.py
python $< -i $(PATEX_URL)/$* -o $@

SHP_URL = https://www2.census.gov/geo/tiger/GENZ2019/shp
SHP_FILES = cb_2019_us_cbsa_20m.zip
SHP_TARGETS := $(foreach F,$(SHP_FILES),$(DATA_DIR_SHP)/$F)

$(SHP_TARGETS): $(DATA_DIR_SHP)/%: $(SCRIPT_DIR)/download.py
python $< -i $(SHP_URL)/$* -o $@

#################################################

$(DATA_DIR_INTM)/msa_patent.tsv.zip: $(SCRIPT_DIR)/make-patent-database.py $(DATA_DIR_USPTO)/patent.tsv.zip $(DATA_DIR_USPTO)/application.tsv.zip $(DATA_DIR_USPTO)/patent_inventor.tsv.zip $(DATA_DIR_USPTO)/location.tsv.zip $(DATA_DIR_SHP)/cb_2018_us_cbsa_20m.zip
$(DATA_DIR_INTM)/msa_patent.tsv.zip: $(SCRIPT_DIR)/make-patent-database.py $(DATA_DIR_USPTO)/patent.tsv.zip $(DATA_DIR_USPTO)/patent_inventor.tsv.zip $(DATA_DIR_USPTO)/location.tsv.zip $(DATA_DIR_SHP)/cb_2019_us_cbsa_20m.zip
python $< -I $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_patent.tsv.zip: $(SCRIPT_DIR)/make-patent-msa-database.py $(DATA_DIR_INTM)/msa_patent.tsv.zip
Expand All @@ -56,15 +64,15 @@ $(DATA_DIR_PROC)/msa_patent.tsv.zip: $(SCRIPT_DIR)/make-patent-msa-database.py $
$(DATA_DIR_PROC)/msa_patent_inventor.tsv.zip: $(SCRIPT_DIR)/make-patent-inventor-database.py $(DATA_DIR_INTM)/msa_patent.tsv.zip
python $< -i $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_patent_info.tsv.zip: $(SCRIPT_DIR)/make-patent-info-database.py $(DATA_DIR_INTM)/msa_patent.tsv.zip
python $< -i $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_label.tsv.zip: $(SCRIPT_DIR)/make-msa-label-database.py $(DATA_DIR_INTM)/msa_patent.tsv.zip
python $< -i $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_citation.tsv.zip: $(SCRIPT_DIR)/make-citation-database.py $(DATA_DIR_PROC)/msa_patent.tsv.zip $(DATA_DIR_USPTO)/uspatentcitation.tsv.zip
python $< -I $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_patent_info.tsv.zip: $(SCRIPT_DIR)/make-patent-info-database.py $(DATA_DIR_PROC)/msa_patent.tsv.zip $(DATA_DIR_PROC)/msa_citation.tsv.zip $(DATA_DIR_USPTO)/patent.tsv.zip $(DATA_DIR_USPTO)/application.tsv.zip $(DATA_DIR_PATEX)/application_data.csv.zip
python $< -I $(filter-out $<,$^) -o $@

$(DOCS_DIR)/README_tables.md: $(SCRIPT_DIR)/make-readme-tables.py $(DATA_DIR_PROC)/msa_patent.tsv.zip $(DATA_DIR_PROC)/msa_patent_inventor.tsv.zip $(DATA_DIR_PROC)/msa_patent_info.tsv.zip $(DATA_DIR_PROC)/msa_label.tsv.zip $(DATA_DIR_PROC)/msa_citation.tsv.zip
python $< -I $(filter-out $<,$^) -o $@
README.md: $(DOCS_DIR)/README_base.md $(DOCS_DIR)/README_tables.md
Expand Down
50 changes: 30 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Patenting in the US Metropolitan Areas
This repository builds a database that collects information about the US patent applications developed by inventors located in Metropolitan Statistical Areas (MSA).

The data are aggregated at the Core Based Statistical Area (CBSA) level, based on the location (latitude and longitude) of each inventor, as provided by PatentsView. The boundaries of each CBSA are constant over time and based on the data provided by the US Census (version 2019). Each inventor within a patent is assigned a fraction of the patent proportional to the size of the "inventing team". Likewise, a fractional count of the inventors of each patent located in a given metropolitan area is provided.

Moreover, for each patent (partly) invented in a metropolitan area, the forward citations received by the patent are provided.

Lastly, for each of these patents (and citing patents), the application and publication dates, the number of claims, and the main USPC patent class are reported.

## Reproducibility
To reproduce the database tables, please follow these steps:
1. Install [Git](https://git-scm.com/)
Expand Down Expand Up @@ -35,7 +41,7 @@ Otherwise, please fork the repository, modify the code as you think is the best,
### Database
The database is released under a [*CC-BY 4.0 License*](https://creativecommons.org/licenses/by/4.0/).

The raw data, elaborted by the scripts contained in this repository, are from [PatentsView](https://www.patentsview.org/) and [US Census](https://www.census.gov/). You can find further references to the raw files used in the Makefile file.
The raw data, elaborated by the scripts contained in this repository, are from [PatentsView](https://www.patentsview.org/), the [US Census](https://www.census.gov/), and the USPTO's [Patent Examination Research Dataset (PatEx)](https://www.uspto.gov/learning-and-resources/electronic-data-products/patent-examination-research-dataset-public-pair). You can find further references to the raw files used in the Makefile.

## Folders structure
```
Expand Down Expand Up @@ -74,31 +80,35 @@ The following tables describe the database files, showing the first five rows of
### msa_patent_inventor
| patent_id | inventor_id | inventor_share |
|-------------|---------------|------------------|
| 10000000 | 5073021-1 | 1 |
| 10000007 | 9473749-3 | 0.2 |
| 10000007 | 9862137-3 | 0.2 |
| 10000007 | 9862137-4 | 0.2 |
| 10000007 | 9862137-5 | 0.2 |
| 10000000 | 5073021-1 | 1 |
| 10003756 | 10003756-2 | 0.5 |
| 10003780 | 9495415-4 | 0.2 |
| 10006993 | 5763054-3 | 1 |
| 10007786 | 6067410-1 | 0.333333 |


### msa_patent_info
| patent_id | grant_date | appln_date | num_claims |
|-------------|--------------|--------------|--------------|
| 10000000 | 2018-06-19 | 2015-03-10 | 20 |
| 10000007 | 2018-06-19 | 2016-06-10 | 24 |
| 10000008 | 2018-06-19 | 2014-12-01 | 11 |
| 10000009 | 2018-06-19 | 2015-02-05 | 21 |
| 10000010 | 2018-06-19 | 2016-06-29 | 20 |
| patent_id | grant_date | appln_date | num_claims | uspc_class |
|-------------|--------------|--------------|--------------|--------------|
| 10000000 | 2018-06-19 | 2015-03-10 | 20 | 356 |
| 10000002 | 2018-06-19 | 2014-12-30 | 9 | 428 |
| 10000003 | 2018-06-19 | 2013-03-12 | 18 | 156 |
| 10000004 | 2018-06-19 | 2015-12-17 | 6 | 428 |
| 10000005 | 2018-06-19 | 2012-08-03 | 4 | 156 |

Notes:
* Rename *patent_id* as *forward_citation_id* to merge this table with the *msa_citation* table.
* 5.4% of the *patent_id*s have no *uspc_class* (most of which are very old or very recent patents).


### msa_label
| cbsa_id | csa_id | cbsa_label |
|-----------|----------|---------------------------------------|
| 31080 | 348 | Los Angeles-Long Beach-Anaheim, CA |
| 33340 | 376 | Milwaukee-Waukesha-West Allis, WI |
| 35620 | 408 | New York-Newark-Jersey City, NY-NJ-PA |
| 41860 | 488 | San Francisco-Oakland-Hayward, CA |
| 40380 | 464 | Rochester, NY |
| csa_id | cbsa_id | cbsa_label |
|----------|-----------|---------------------------------------|
| 348 | 31080 | Los Angeles-Long Beach-Anaheim, CA |
| 376 | 33340 | Milwaukee-Waukesha, WI |
| 408 | 35620 | New York-Newark-Jersey City, NY-NJ-PA |
| 488 | 41860 | San Francisco-Oakland-Berkeley, CA |
| 464 | 40380 | Rochester, NY |


### msa_citation
Expand Down
8 changes: 7 additions & 1 deletion docs/README_base.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Patenting in the US Metropolitan Areas
This repository builds a database that collects information about the US patent applications developed by inventors located in Metropolitan Statistical Areas (MSA).

The data are aggregated at the Core Based Statistical Area (CBSA) level, based on the location (latitude and longitude) of each inventor, as provided by PatentsView. The boundaries of each CBSA are constant over time and based on the data provided by the US Census (version 2019). Each inventor within a patent is assigned a fraction of the patent proportional to the size of the "inventing team". Likewise, a fractional count of the inventors of each patent located in a given metropolitan area is provided.

Moreover, for each patent (partly) invented in a metropolitan area, the forward citations received by the patent are provided.

Lastly, for each of these patents (and citing patents), the application and publication dates, the number of claims, and the main USPC patent class are reported.

## Reproducibility
To reproduce the database tables, please follow these steps:
1. Install [Git](https://git-scm.com/)
Expand Down Expand Up @@ -35,7 +41,7 @@ Otherwise, please fork the repository, modify the code as you think is the best,
### Database
The database is released under a [*CC-BY 4.0 License*](https://creativecommons.org/licenses/by/4.0/).

The raw data, elaborted by the scripts contained in this repository, are from [PatentsView](https://www.patentsview.org/) and [US Census](https://www.census.gov/). You can find further references to the raw files used in the Makefile file.
The raw data, elaborated by the scripts contained in this repository, are from [PatentsView](https://www.patentsview.org/), the [US Census](https://www.census.gov/), and the USPTO's [Patent Examination Research Dataset (PatEx)](https://www.uspto.gov/learning-and-resources/electronic-data-products/patent-examination-research-dataset-public-pair). You can find further references to the raw files used in the Makefile.

## Folders structure
```
Expand Down
Binary file modified makefile.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 6 additions & 7 deletions src/make-msa-label-database.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,17 @@ def main():

df_patent = pd.read_table(
args.input,
dtype=str)
usecols=[
'cbsa_id',
'csa_id',
'cbsa_label']) \
.drop_duplicates()

dir, file = os.path.split(args.output)
if not os.path.exists(dir):
os.makedirs(dir)

df_patent[[
'cbsa_id',
'csa_id',
'cbsa_label']] \
.drop_duplicates() \
.to_csv(
df_patent.to_csv(
args.output,
sep='\t',
index=False,
Expand Down
87 changes: 7 additions & 80 deletions src/make-patent-database.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,94 +14,23 @@
import pandas as pd
import geopandas as gpd
import os
import requests
from parse_args import parse_io


def fix_dates(dataframe:pd.DataFrame, dates_column:str):
    """Fix wrong dates in the PatentsView database

    Some (grant and application) dates on PatentsView are wrongly reported
    and cannot be converted into proper dates. However, if you look on the
    PatentsView website, most (all?) are correct. Therefore, this function
    uses the PatentsView APIs to retrieve the correct dates
    (or, as a second-best, it tries to fix them with a simple heuristic).

    Args:
        dataframe: table with a 'patent_id' column and the date column to
            be fixed (strings shaped like 'YYYY-MM-DD'); modified in place
            and also returned
        dates_column: name of the column holding the dates to fix

    Returns:
        The same DataFrame, sorted by ['patent_id', dates_column], with
        unparsable dates replaced where possible
    """
    # Use the PatentsView API to fix those application dates
    # that cannot be coerced into proper dates
    dataframe['date_'] = pd.to_datetime(
        dataframe[dates_column], errors='coerce')
    dataframe.sort_values(by=['patent_id','date_'], inplace=True)
    dataframe.set_index('patent_id', inplace=True)
    patents_to_fix = ','.join([f'{{"patent_number":"{patent_id}"}}' \
        for patent_id in dataframe[dataframe.date_.isna()].index])
    patents_to_fix_n = sum(dataframe.date_.isna())
    if patents_to_fix_n>0:
        # Ask the API only for the broken patent numbers, in one request
        query = ''.join([
            'https://api.patentsview.org/patents/query?q={"_or":[',
            patents_to_fix,
            ']}&f=["patent_number","patent_date"]&o={"per_page":',
            str(patents_to_fix_n), '}'])
        response = requests.get(query)
        df_fix = pd.DataFrame(response.json()['patents'], dtype=str)
        df_fix.rename(columns={
            'patent_number':'patent_id',
            'patent_date':dates_column}, inplace=True)
        df_fix.sort_values(by=['patent_id',dates_column], inplace=True)
        df_fix.set_index('patent_id', inplace=True)
        # update() aligns on the patent_id index and overwrites in place
        dataframe.update(df_fix)
    dataframe.drop(columns='date_', inplace=True)
    dataframe.reset_index(inplace=True)
    dataframe.sort_values(by=['patent_id',dates_column], inplace=True)

    # At this point, all the mistakes should have been fixed
    # Anyhow, the script will fix dates that are possibly still wrong
    # applying some heuristic with the best guesses we can do,
    # given the information provided
    # Fix any date that has "00" as day, putting "01" in its place
    subset = dataframe[dates_column].str.endswith('00')
    # BUG FIX: the original guard was `len(subset)>0`, which is true for
    # any non-empty frame because `subset` is a full-length boolean mask;
    # `subset.any()` tests whether any date actually ends with "00"
    if subset.any():
        dataframe.loc[
            subset,dates_column] = dataframe.loc[
            subset,dates_column].str[:-2] + '01'
    # Fix any date whose year doesn't start with "19" or "20",
    # putting "19" in its place
    subset = dataframe[dates_column].str[:2].isin(['19','20'])
    dataframe.loc[
        ~subset,dates_column] = '19' + dataframe.loc[
        ~subset,dates_column].str[2:]

    return dataframe


def main():
args = parse_io()

df_patent = pd.read_table(
args.input_list[0], # patent.tsv.zip
usecols=[
'id',
'date',
'num_claims'],
'id'],
dtype=str) \
.rename(columns={
'id':'patent_id',
'date':'grant_date'})
'id':'patent_id'})
df_patent = df_patent[df_patent.patent_id.str.isnumeric()]

df_application = pd.read_table(
args.input_list[1], # application.tsv.zip
usecols=[
'patent_id',
'date'],
dtype=str) \
.rename(columns={
'date':'appln_date'})

df_patent = pd.merge(df_patent, df_application)
del df_application

df_patent_inventor = pd.read_table(
args.input_list[2], # patent_inventor.tsv.zip
args.input_list[1], # patent_inventor.tsv.zip
dtype=str) \
.dropna()

Expand All @@ -120,7 +49,7 @@ def main():
how='left')

df_location = pd.read_table(
args.input_list[3], # location.tsv.zip
args.input_list[2], # location.tsv.zip
usecols=[
'id',
'latitude',
Expand Down Expand Up @@ -148,8 +77,8 @@ def main():
inplace=True)

# M1 = Metropolitan areas
df_cbsa = gpd.read_file( # cb_2018_us_cbsa_20m.zip
f'zip://{args.input_list[4]}') \
df_cbsa = gpd.read_file( # cb_2019_us_cbsa_20m.zip
f'zip://{args.input_list[3]}') \
.query('LSAD=="M1"') \
.drop(columns=['LSAD','ALAND','AWATER']) \
.rename(columns={
Expand All @@ -162,9 +91,7 @@ def main():
op='within') \
.drop(columns='index_right')
df_patent = pd.DataFrame(df_patent)

for date_column in ['grant_date', 'appln_date']:
df_patent = fix_dates(df_patent, date_column)
del df_cbsa

dir, file = os.path.split(args.output)
if not os.path.exists(dir):
Expand Down
Loading

0 comments on commit 212f380

Please sign in to comment.