Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,15 @@ pypatent.Search('TTL/(tennis AND (racquet OR racket))')

Alternatively, you can pass one or more Field Code arguments to search within specific fields. Multiple Field Code arguments are combined with AND logic; OR logic can be used within a single argument. For more complex logic, use a custom search string.
```python
pypatent.Search(pn='adobe', ttl='software') # Equivalent to search('PN/adobe AND TTL/software')
pypatent.Search(pn=('adobe or macromedia'), ttl='software') # Equivalent to search('PN/(adobe or macromedia) AND TTL/software')
pypatent.Search(an='adobe', ttl='software') # Equivalent to search('AN/adobe AND TTL/software')
pypatent.Search(an=('adobe or macromedia'), ttl='software') # Equivalent to search('AN/(adobe or macromedia) AND TTL/software')
```

#### Combining search methods 1 and 2

String criteria can be used in conjunction with Field Code arguments:
```python
pypatent.Search('acrobat', pn='adobe', ttl='software') # Equivalent to search('acrobat AND PN/adobe AND TTL/software')
pypatent.Search('acrobat', an='adobe', ttl='software') # Equivalent to search('acrobat AND AN/adobe AND TTL/software')
```

The Field Code arguments have the same meaning as on the [USPTO site](http://patft.uspto.gov/netahtml/PTO/search-adv.htm).
Expand Down Expand Up @@ -259,4 +259,4 @@ print(res)
This version makes searching and storing patent data easier:
* Simplified to 2 objects: `Search` and `Patent`
* A `Search` object searches the USPTO site and can output the results as a DataFrame or list. It can scrape the details of each patent, or just get the patent title and URL. Most users will only need to use this object.
* A `Patent` object fetches and holds a single patent's info. Fetching the patent's details is now optional. This object should only be used when you already have the patent URL and aren't conducting a search.
* A `Patent` object fetches and holds a single patent's info. Fetching the patent's details is now optional. This object should only be used when you already have the patent URL and aren't conducting a search.
114 changes: 85 additions & 29 deletions pypatent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import pandas as pd
from selenium import webdriver


class WebConnection:
def __init__(self,
use_selenium: bool = False,
Expand Down Expand Up @@ -198,6 +197,7 @@ def __repr__(self):


class Search:

def __init__(self,
string: str = None,
results_limit: int = 50,
Expand Down Expand Up @@ -263,46 +263,103 @@ def __init__(self,
self.web_connection = web_connection
else:
self.web_connection = WebConnection()
args = {k: str(v).replace(' ', '-') for k, v in locals().items() if v and v is not self and v not in [get_patent_details, results_limit, web_connection]}
searchstring = ' AND '.join(['%s/%s' % (key, value) for (key, value) in args.items() if key not in ['results_limit']])

args = {k: v.lower() for k, v in locals().items() if v and v is not self and v not in [get_patent_details, results_limit, web_connection]}

search_codes = dict({
'PN': 'Patent Number',
'ISD': 'Issue Date',
'TTL': 'Title',
'ABST': 'Abstract',
'ACLM': 'Claim(s)',
'SPEC': 'Description/Specification',
'CCL': 'Current US Classification',
'CPC': 'Current CPC Classification',
'CPCL': 'Current CPC Classification Class',
'ICL': 'International Classification',
'APN': 'Application Serial Number',
'APD': 'Application Date',
'APT': 'Application Type',
'GOVT': 'Government Interest',
'FMID': 'Patent Family ID',
'PARN': 'Parent Case Information',
'RLAP': 'Related US App. Data',
'RLFD': 'Related Application Filing Date',
'PRIR': 'Foreign Priority',
'PRAD': 'Priority Filing Date',
'PCT': 'PCT Information',
'PTAD': 'PCT Filing Date',
'PT3D': 'PCT 371c124 Date',
'PPPD': 'Prior Published Document Date',
'REIS': 'Reissue Data',
'RPAF': 'Reissued Patent Application Filing Date',
'AFFF': '130(b) Affirmation Flag',
'AFFT': '130(b) Affirmation Statement',
'IN': 'Inventor Name',
'IC': 'Inventor City',
'IS': 'Inventor State',
'ICN': 'Inventor Country',
'AANM': 'Applicant Name',
'AACI': 'Applicant City',
'AAST': 'Applicant State',
'AACO': 'Applicant Country',
'AAAT': 'Applicant Type',
'LREP': 'Attorney or Agent',
'AN': 'Assignee Name',
'AC': 'Assignee City',
'AS': 'Assignee State',
'ACN': 'Assignee Country',
'EXP': 'Primary Examiner',
'EXA': 'Assistant Examiner',
'REF': 'Referenced By',
'FREF': 'Foreign References',
'OREF': 'Other References',
'COFC': 'Certificate of Correction',
'REEX': 'Re-Examination Certificate',
'PTAB': 'PTAB Trial Certificate',
'SEC': 'Supplemental Exam Certificate',
'ILRN': 'International Registration Number',
'ILRD': 'International Registration Date',
'ILPD': 'International Registration Publication Date',
'ILFD': 'Hague International Filing Date'
})
for k, v in args.items():
if k == 'string' and '/' in v:
(kk, p, v) = v.partition('/')
if v and kk.upper() in search_codes:
args[k] = '"{}"'.format(v)
searchstring = ' and '.join(['%s/%s' % (key, value) for (key, value) in args.items()])
searchstring = searchstring.replace('string/', '')
searchstring = searchstring.replace(' ', '+')
searchstring = searchstring.replace('-and-', '+and+')

replace_dict = {'/': '%2F'}

for k, v in replace_dict.items():
searchstring = searchstring.replace(k, v)

base_url = 'http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p=1&f=S&l=50&Query='

url = base_url + searchstring + '&d=PTXT'
r = self.web_connection.get(url)
s = BeautifulSoup(r, 'html.parser')
total_results = int(s.find(string=re.compile('out of')).find_next().text.strip())

patents = self.get_patents_from_results_url(url, limit=results_limit)

num_results_fetched = len(patents)

list_num = 2

base_url_nextpgs = 'http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&f=S&l=50&d=PTXT'

url_pre = base_url_nextpgs + '&OS=' + searchstring + '&RS=' + searchstring + '&Query=' + searchstring + '&TD=' + str(total_results) + '&Srch1=' + searchstring + '&NextList'
url_post = '=Next+50+Hits'

while (num_results_fetched < total_results) and (num_results_fetched < results_limit):
this_url = url_pre + str(list_num) + url_post
thispatents = self.get_patents_from_results_url(this_url)
patents.extend(thispatents)

if s.find(string=re.compile('out of')): #only proceed with function if search produces results
total_results = int(s.find(string=re.compile('out of')).find_next().text.strip())
patents = self.get_patents_from_results_url(url, limit=results_limit)
num_results_fetched = len(patents)
list_num = 2
while (num_results_fetched < total_results) and (num_results_fetched < results_limit):
url_nextpg = 'http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p={0}&f=S&l=50&Query={1}&d=PTXT'.format(list_num, searchstring)

thispatents = self.get_patents_from_results_url(url_nextpg, limit=(results_limit - num_results_fetched))
patents.extend(thispatents)

if num_results_fetched >= results_limit:
patents = patents[:results_limit]
num_results_fetched = len(patents)

list_num += 1
if num_results_fetched >= results_limit:
patents = patents[:results_limit]

list_num += 1
else:
patents = []
self.patents = patents

def get_patents_from_results_url(self, url: str, limit: int = None) -> list:
Expand All @@ -314,18 +371,17 @@ def get_patents_from_results_url(self, url: str, limit: int = None) -> list:
i.text.replace('\n', '').strip() != '']

patents = []

for patent_num_idx in range(0, len(patents_raw_list), 2):
if limit and (patent_num_idx + 1) > limit:
if limit and (patent_num_idx/2 + 1) > limit:
break
patent_title = patents_raw_list[patent_num_idx + 1][0]
patent_title = re.sub(' +', ' ', patent_title)
patent_link = patents_raw_list[patent_num_idx][1]

p = Patent(patent_title, patent_link, self.web_connection)
if self.get_patent_details:
p.fetch_details()
patents.append(p)

return patents

def as_dataframe(self) -> pd.DataFrame:
Expand Down