Skip to content

Commit

Permalink
Use 2019 shapefile, add USPC class, and fix bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
n3ssuno committed Feb 8, 2021
1 parent 4bf3c0b commit 212f380
Show file tree
Hide file tree
Showing 11 changed files with 221 additions and 144 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Carlo Bottai (Eindhoven University of Technology)
24 changes: 16 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ DATA_DIR_INTM = $(DATA_DIR)/interim
DATA_DIR_PROC = $(DATA_DIR)/processed

DATA_DIR_USPTO = $(DATA_DIR_RAW)/patentsview
DATA_DIR_PATEX = $(DATA_DIR_RAW)/patex
DATA_DIR_SHP = $(DATA_DIR_RAW)/cartography

SCRIPT_DIR = src
Expand All @@ -35,19 +36,26 @@ USPTO_URL = https://s3.amazonaws.com/data.patentsview.org/20200929/download
USPTO_FILES = patent.tsv.zip application.tsv.zip patent_inventor.tsv.zip location.tsv.zip uspatentcitation.tsv.zip
USPTO_TARGETS := $(foreach F,$(USPTO_FILES),$(DATA_DIR_USPTO)/$F)

SHP_URL = https://www2.census.gov/geo/tiger/GENZ2018/shp
SHP_FILES = cb_2018_us_cbsa_20m.zip
SHP_TARGETS := $(foreach F,$(SHP_FILES),$(DATA_DIR_SHP)/$F)

$(USPTO_TARGETS): $(DATA_DIR_USPTO)/%: $(SCRIPT_DIR)/download.py
python $< -i $(USPTO_URL)/$* -o $@

PATEX_URL = https://bulkdata.uspto.gov/data/patent/pair/economics/2019
PATEX_FILES = application_data.csv.zip
PATEX_TARGETS := $(foreach F,$(PATEX_FILES),$(DATA_DIR_PATEX)/$F)

$(PATEX_TARGETS): $(DATA_DIR_PATEX)/%: $(SCRIPT_DIR)/download.py
python $< -i $(PATEX_URL)/$* -o $@

SHP_URL = https://www2.census.gov/geo/tiger/GENZ2019/shp
SHP_FILES = cb_2019_us_cbsa_20m.zip
SHP_TARGETS := $(foreach F,$(SHP_FILES),$(DATA_DIR_SHP)/$F)

$(SHP_TARGETS): $(DATA_DIR_SHP)/%: $(SCRIPT_DIR)/download.py
python $< -i $(SHP_URL)/$* -o $@

#################################################

$(DATA_DIR_INTM)/msa_patent.tsv.zip: $(SCRIPT_DIR)/make-patent-database.py $(DATA_DIR_USPTO)/patent.tsv.zip $(DATA_DIR_USPTO)/application.tsv.zip $(DATA_DIR_USPTO)/patent_inventor.tsv.zip $(DATA_DIR_USPTO)/location.tsv.zip $(DATA_DIR_SHP)/cb_2018_us_cbsa_20m.zip
$(DATA_DIR_INTM)/msa_patent.tsv.zip: $(SCRIPT_DIR)/make-patent-database.py $(DATA_DIR_USPTO)/patent.tsv.zip $(DATA_DIR_USPTO)/patent_inventor.tsv.zip $(DATA_DIR_USPTO)/location.tsv.zip $(DATA_DIR_SHP)/cb_2019_us_cbsa_20m.zip
python $< -I $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_patent.tsv.zip: $(SCRIPT_DIR)/make-patent-msa-database.py $(DATA_DIR_INTM)/msa_patent.tsv.zip
Expand All @@ -56,15 +64,15 @@ $(DATA_DIR_PROC)/msa_patent.tsv.zip: $(SCRIPT_DIR)/make-patent-msa-database.py $
$(DATA_DIR_PROC)/msa_patent_inventor.tsv.zip: $(SCRIPT_DIR)/make-patent-inventor-database.py $(DATA_DIR_INTM)/msa_patent.tsv.zip
python $< -i $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_patent_info.tsv.zip: $(SCRIPT_DIR)/make-patent-info-database.py $(DATA_DIR_INTM)/msa_patent.tsv.zip
python $< -i $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_label.tsv.zip: $(SCRIPT_DIR)/make-msa-label-database.py $(DATA_DIR_INTM)/msa_patent.tsv.zip
python $< -i $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_citation.tsv.zip: $(SCRIPT_DIR)/make-citation-database.py $(DATA_DIR_PROC)/msa_patent.tsv.zip $(DATA_DIR_USPTO)/uspatentcitation.tsv.zip
python $< -I $(filter-out $<,$^) -o $@

$(DATA_DIR_PROC)/msa_patent_info.tsv.zip: $(SCRIPT_DIR)/make-patent-info-database.py $(DATA_DIR_PROC)/msa_patent.tsv.zip $(DATA_DIR_PROC)/msa_citation.tsv.zip $(DATA_DIR_USPTO)/patent.tsv.zip $(DATA_DIR_USPTO)/application.tsv.zip $(DATA_DIR_PATEX)/application_data.csv.zip
python $< -I $(filter-out $<,$^) -o $@

$(DOCS_DIR)/README_tables.md: $(SCRIPT_DIR)/make-readme-tables.py $(DATA_DIR_PROC)/msa_patent.tsv.zip $(DATA_DIR_PROC)/msa_patent_inventor.tsv.zip $(DATA_DIR_PROC)/msa_patent_info.tsv.zip $(DATA_DIR_PROC)/msa_label.tsv.zip $(DATA_DIR_PROC)/msa_citation.tsv.zip
python $< -I $(filter-out $<,$^) -o $@
README.md: $(DOCS_DIR)/README_base.md $(DOCS_DIR)/README_tables.md
Expand Down
50 changes: 30 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Patenting in the US Metropolitan Areas
This repository builds a database that collects information about the US patent applications developed by inventors located in Metropolitan Statistical Areas (MSA).

The data are aggregated at the Core Based Statistical Area (CBSA) level, based on the location (latitude and longitude) of each inventor, as provided by PatentsView. The boundaries of each CBSA are constant over time and based on the data provided by the US Census (version 2019). Each inventor within a patent is assigned a fraction of the patent proportional to the size of the "inventing team". Likewise, a fractional count of the inventors of each patent located in a given metropolitan area is provided.

Moreover, for each patent (partly) invented in a metropolitan area, the forward citations received by the patent are provided.

Lastly, for each of these patents (and citing patents), the application and publication dates, the number of claims, and the main USPC patent class are reported.

## Reproducibility
To reproduce the database tables, please follow these steps:
1. Install [Git](https://git-scm.com/)
Expand Down Expand Up @@ -35,7 +41,7 @@ Otherwise, please fork the repository, modify the code as you think is the best,
### Database
The database is released under a [*CC-BY 4.0 License*](https://creativecommons.org/licenses/by/4.0/).

The raw data, elaborted by the scripts contained in this repository, are from [PatentsView](https://www.patentsview.org/) and [US Census](https://www.census.gov/). You can find further references to the raw files used in the Makefile file.
The raw data, elaborated by the scripts contained in this repository, are from [PatentsView](https://www.patentsview.org/), the [US Census](https://www.census.gov/), and the USPTO's [Patent Examination Research Dataset (PatEx)](https://www.uspto.gov/learning-and-resources/electronic-data-products/patent-examination-research-dataset-public-pair). You can find further references to the raw files used in the Makefile.

## Folders structure
```
Expand Down Expand Up @@ -74,31 +80,35 @@ The following tables describe the database files, showing the first five rows of
### msa_patent_inventor
| patent_id | inventor_id | inventor_share |
|-------------|---------------|------------------|
| 10000000 | 5073021-1 | 1 |
| 10000007 | 9473749-3 | 0.2 |
| 10000007 | 9862137-3 | 0.2 |
| 10000007 | 9862137-4 | 0.2 |
| 10000007 | 9862137-5 | 0.2 |
| 10000000 | 5073021-1 | 1 |
| 10003756 | 10003756-2 | 0.5 |
| 10003780 | 9495415-4 | 0.2 |
| 10006993 | 5763054-3 | 1 |
| 10007786 | 6067410-1 | 0.333333 |


### msa_patent_info
| patent_id | grant_date | appln_date | num_claims |
|-------------|--------------|--------------|--------------|
| 10000000 | 2018-06-19 | 2015-03-10 | 20 |
| 10000007 | 2018-06-19 | 2016-06-10 | 24 |
| 10000008 | 2018-06-19 | 2014-12-01 | 11 |
| 10000009 | 2018-06-19 | 2015-02-05 | 21 |
| 10000010 | 2018-06-19 | 2016-06-29 | 20 |
| patent_id | grant_date | appln_date | num_claims | uspc_class |
|-------------|--------------|--------------|--------------|--------------|
| 10000000 | 2018-06-19 | 2015-03-10 | 20 | 356 |
| 10000002 | 2018-06-19 | 2014-12-30 | 9 | 428 |
| 10000003 | 2018-06-19 | 2013-03-12 | 18 | 156 |
| 10000004 | 2018-06-19 | 2015-12-17 | 6 | 428 |
| 10000005 | 2018-06-19 | 2012-08-03 | 4 | 156 |

Notes:
* Rename *patent_id* as *forward_citation_id* to merge this table with the *msa_citation* table.
* 5.4% of the *patent_id*s have no *uspc_class* (most of which are very old or very recent patents).


### msa_label
| cbsa_id | csa_id | cbsa_label |
|-----------|----------|---------------------------------------|
| 31080 | 348 | Los Angeles-Long Beach-Anaheim, CA |
| 33340 | 376 | Milwaukee-Waukesha-West Allis, WI |
| 35620 | 408 | New York-Newark-Jersey City, NY-NJ-PA |
| 41860 | 488 | San Francisco-Oakland-Hayward, CA |
| 40380 | 464 | Rochester, NY |
| csa_id | cbsa_id | cbsa_label |
|----------|-----------|---------------------------------------|
| 348 | 31080 | Los Angeles-Long Beach-Anaheim, CA |
| 376 | 33340 | Milwaukee-Waukesha, WI |
| 408 | 35620 | New York-Newark-Jersey City, NY-NJ-PA |
| 488 | 41860 | San Francisco-Oakland-Berkeley, CA |
| 464 | 40380 | Rochester, NY |


### msa_citation
Expand Down
8 changes: 7 additions & 1 deletion docs/README_base.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Patenting in the US Metropolitan Areas
This repository builds a database that collects information about the US patent applications developed by inventors located in Metropolitan Statistical Areas (MSA).

The data are aggregated at the Core Based Statistical Area (CBSA) level, based on the location (latitude and longitude) of each inventor, as provided by PatentsView. The boundaries of each CBSA are constant over time and based on the data provided by the US Census (version 2019). Each inventor within a patent is assigned a fraction of the patent proportional to the size of the "inventing team". Likewise, a fractional count of the inventors of each patent located in a given metropolitan area is provided.

Moreover, for each patent (partly) invented in a metropolitan area, the forward citations received by the patent are provided.

Lastly, for each of these patents (and citing patents), the application and publication dates, the number of claims, and the main USPC patent class are reported.

## Reproducibility
To reproduce the database tables, please follow these steps:
1. Install [Git](https://git-scm.com/)
Expand Down Expand Up @@ -35,7 +41,7 @@ Otherwise, please fork the repository, modify the code as you think is the best,
### Database
The database is released under a [*CC-BY 4.0 License*](https://creativecommons.org/licenses/by/4.0/).

The raw data, elaborted by the scripts contained in this repository, are from [PatentsView](https://www.patentsview.org/) and [US Census](https://www.census.gov/). You can find further references to the raw files used in the Makefile file.
The raw data, elaborated by the scripts contained in this repository, are from [PatentsView](https://www.patentsview.org/), the [US Census](https://www.census.gov/), and the USPTO's [Patent Examination Research Dataset (PatEx)](https://www.uspto.gov/learning-and-resources/electronic-data-products/patent-examination-research-dataset-public-pair). You can find further references to the raw files used in the Makefile.

## Folders structure
```
Expand Down
Binary file modified makefile.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 6 additions & 7 deletions src/make-msa-label-database.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,17 @@ def main():

df_patent = pd.read_table(
args.input,
dtype=str)
usecols=[
'cbsa_id',
'csa_id',
'cbsa_label']) \
.drop_duplicates()

dir, file = os.path.split(args.output)
if not os.path.exists(dir):
os.makedirs(dir)

df_patent[[
'cbsa_id',
'csa_id',
'cbsa_label']] \
.drop_duplicates() \
.to_csv(
df_patent.to_csv(
args.output,
sep='\t',
index=False,
Expand Down
87 changes: 7 additions & 80 deletions src/make-patent-database.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,94 +14,23 @@
import pandas as pd
import geopandas as gpd
import os
import requests
from parse_args import parse_io


def fix_dates(dataframe:pd.DataFrame, dates_column:str):
    """Fix wrong dates in the PatentsView database

    Some (grant and application) dates on PatentsView are wrongly reported
    and cannot be converted into proper dates. However, if you look on the
    PatentsView website, most (all?) are correct. Therefore, this function
    uses the PatentsView APIs to retrieve the correct dates
    (or, as a second-best, it tries to fix them with a simple heuristic).

    Args:
        dataframe: table with a 'patent_id' column and the date column to
            be fixed (strings shaped like 'YYYY-MM-DD'); modified in place
            and also returned
        dates_column: name of the column holding the dates to fix

    Returns:
        The same DataFrame, sorted by ['patent_id', dates_column], with
        unparsable dates replaced where possible
    """
    # Use the PatentsView API to fix those application dates
    # that cannot be coerced into proper dates
    dataframe['date_'] = pd.to_datetime(
        dataframe[dates_column], errors='coerce')
    dataframe.sort_values(by=['patent_id','date_'], inplace=True)
    dataframe.set_index('patent_id', inplace=True)
    patents_to_fix = ','.join([f'{{"patent_number":"{patent_id}"}}' \
        for patent_id in dataframe[dataframe.date_.isna()].index])
    patents_to_fix_n = sum(dataframe.date_.isna())
    if patents_to_fix_n>0:
        # Ask the API only for the broken patent numbers, in one request
        query = ''.join([
            'https://api.patentsview.org/patents/query?q={"_or":[',
            patents_to_fix,
            ']}&f=["patent_number","patent_date"]&o={"per_page":',
            str(patents_to_fix_n), '}'])
        response = requests.get(query)
        df_fix = pd.DataFrame(response.json()['patents'], dtype=str)
        df_fix.rename(columns={
            'patent_number':'patent_id',
            'patent_date':dates_column}, inplace=True)
        df_fix.sort_values(by=['patent_id',dates_column], inplace=True)
        df_fix.set_index('patent_id', inplace=True)
        # update() aligns on the patent_id index and overwrites in place
        dataframe.update(df_fix)
    dataframe.drop(columns='date_', inplace=True)
    dataframe.reset_index(inplace=True)
    dataframe.sort_values(by=['patent_id',dates_column], inplace=True)

    # At this point, all the mistakes should have been fixed
    # Anyhow, the script will fix dates that are possibly still wrong
    # applying some heuristic with the best guesses we can do,
    # given the information provided
    # Fix any date that has "00" as day, putting "01" in its place
    subset = dataframe[dates_column].str.endswith('00')
    # BUG FIX: the original guard was `len(subset)>0`, which is true for
    # any non-empty frame because `subset` is a full-length boolean mask;
    # `subset.any()` tests whether any date actually ends with "00"
    if subset.any():
        dataframe.loc[
            subset,dates_column] = dataframe.loc[
            subset,dates_column].str[:-2] + '01'
    # Fix any date whose year doesn't start with "19" or "20",
    # putting "19" in its place
    subset = dataframe[dates_column].str[:2].isin(['19','20'])
    dataframe.loc[
        ~subset,dates_column] = '19' + dataframe.loc[
        ~subset,dates_column].str[2:]

    return dataframe


def main():
args = parse_io()

df_patent = pd.read_table(
args.input_list[0], # patent.tsv.zip
usecols=[
'id',
'date',
'num_claims'],
'id'],
dtype=str) \
.rename(columns={
'id':'patent_id',
'date':'grant_date'})
'id':'patent_id'})
df_patent = df_patent[df_patent.patent_id.str.isnumeric()]

df_application = pd.read_table(
args.input_list[1], # application.tsv.zip
usecols=[
'patent_id',
'date'],
dtype=str) \
.rename(columns={
'date':'appln_date'})

df_patent = pd.merge(df_patent, df_application)
del df_application

df_patent_inventor = pd.read_table(
args.input_list[2], # patent_inventor.tsv.zip
args.input_list[1], # patent_inventor.tsv.zip
dtype=str) \
.dropna()

Expand All @@ -120,7 +49,7 @@ def main():
how='left')

df_location = pd.read_table(
args.input_list[3], # location.tsv.zip
args.input_list[2], # location.tsv.zip
usecols=[
'id',
'latitude',
Expand Down Expand Up @@ -148,8 +77,8 @@ def main():
inplace=True)

# M1 = Metropolitan areas
df_cbsa = gpd.read_file( # cb_2018_us_cbsa_20m.zip
f'zip://{args.input_list[4]}') \
df_cbsa = gpd.read_file( # cb_2019_us_cbsa_20m.zip
f'zip://{args.input_list[3]}') \
.query('LSAD=="M1"') \
.drop(columns=['LSAD','ALAND','AWATER']) \
.rename(columns={
Expand All @@ -162,9 +91,7 @@ def main():
op='within') \
.drop(columns='index_right')
df_patent = pd.DataFrame(df_patent)

for date_column in ['grant_date', 'appln_date']:
df_patent = fix_dates(df_patent, date_column)
del df_cbsa

dir, file = os.path.split(args.output)
if not os.path.exists(dir):
Expand Down
Loading

0 comments on commit 212f380

Please sign in to comment.