diff --git a/README.md b/README.md index bf7a97b..3f0b25a 100644 --- a/README.md +++ b/README.md @@ -58,15 +58,15 @@ pypatent.Search('TTL/(tennis AND (racquet OR racket))') Alternatively, you can specify one or more Field Code arguments to search within the specified fields. Multiple Field Code arguments will create a search with AND logic. OR logic can be used within a single argument. For more complex logic, use a custom string. ```python -pypatent.Search(pn='adobe', ttl='software') # Equivalent to search('PN/adobe AND TTL/software') -pypatent.Search(pn=('adobe or macromedia'), ttl='software') # Equivalent to search('PN/(adobe or macromedia) AND TTL/software') +pypatent.Search(an='adobe', ttl='software') # Equivalent to search('AN/adobe AND TTL/software') +pypatent.Search(an=('adobe or macromedia'), ttl='software') # Equivalent to search('AN/(adobe or macromedia) AND TTL/software') ``` #### Combining search methods 1 and 2 String criteria can be used in conjunction with Field Code arguments: ```python -pypatent.Search('acrobat', pn='adobe', ttl='software') # Equivalent to search('acrobat AND PN/adobe AND TTL/software') +pypatent.Search('acrobat', an='adobe', ttl='software') # Equivalent to search('acrobat AND AN/adobe AND TTL/software') ``` The Field Code arguments have the same meaning as on the [USPTO site](http://patft.uspto.gov/netahtml/PTO/search-adv.htm). @@ -259,4 +259,4 @@ print(res) This version makes searching and storing patent data easier: * Simplified to 2 objects: `Search` and `Patent` * A `Search` object searches the USPTO site and can output the results as a DataFrame or list. It can scrape the details of each patent, or just get the patent title and URL. Most users will only need to use this object. -* A `Patent` object fetches and holds a single patent's info. Fetching the patent's details is now optional. This object should only be used when you already have the patent URL and aren't conducting a search. \ No newline at end of file +* A `Patent` object fetches and holds a single patent's info. Fetching the patent's details is now optional. This object should only be used when you already have the patent URL and aren't conducting a search. diff --git a/pypatent/__init__.py b/pypatent/__init__.py index c68d0b0..c784917 100644 --- a/pypatent/__init__.py +++ b/pypatent/__init__.py @@ -6,7 +6,6 @@ import pandas as pd from selenium import webdriver - class WebConnection: def __init__(self, use_selenium: bool = False, @@ -198,6 +197,7 @@ def __repr__(self): class Search: + def __init__(self, string: str = None, results_limit: int = 50, @@ -263,46 +263,103 @@ def __init__(self, self.web_connection = web_connection else: self.web_connection = WebConnection() - args = {k: str(v).replace(' ', '-') for k, v in locals().items() if v and v is not self and v not in [get_patent_details, results_limit, web_connection]} - searchstring = ' AND '.join(['%s/%s' % (key, value) for (key, value) in args.items() if key not in ['results_limit']]) + + args = {k: v.lower() for k, v in locals().items() if v and v is not self and v not in [get_patent_details, results_limit, web_connection]} + + search_codes = dict({ + 'PN': 'Patent Number', + 'ISD': 'Issue Date', + 'TTL': 'Title', + 'ABST': 'Abstract', + 'ACLM': 'Claim(s)', + 'SPEC': 'Description/Specification', + 'CCL': 'Current US Classification', + 'CPC': 'Current CPC Classification', + 'CPCL': 'Current CPC Classification Class', + 'ICL': 'International Classification', + 'APN': 'Application Serial Number', + 'APD': 'Application Date', + 'APT': 'Application Type', + 'GOVT': 'Government Interest', + 'FMID': 'Patent Family ID', + 'PARN': 'Parent Case Information', + 'RLAP': 'Related US App. Data', + 'RLFD': 'Related Application Filing Date', + 'PRIR': 'Foreign Priority', + 'PRAD': 'Priority Filing Date', + 'PCT': 'PCT Information', + 'PTAD': 'PCT Filing Date', + 'PT3D': 'PCT 371c124 Date', + 'PPPD': 'Prior Published Document Date', + 'REIS': 'Reissue Data', + 'RPAF': 'Reissued Patent Application Filing Date', + 'AFFF': '130(b) Affirmation Flag', + 'AFFT': '130(b) Affirmation Statement', + 'IN': 'Inventor Name', + 'IC': 'Inventor City', + 'IS': 'Inventor State', + 'ICN': 'Inventor Country', + 'AANM': 'Applicant Name', + 'AACI': 'Applicant City', + 'AAST': 'Applicant State', + 'AACO': 'Applicant Country', + 'AAAT': 'Applicant Type', + 'LREP': 'Attorney or Agent', + 'AN': 'Assignee Name', + 'AC': 'Assignee City', + 'AS': 'Assignee State', + 'ACN': 'Assignee Country', + 'EXP': 'Primary Examiner', + 'EXA': 'Assistant Examiner', + 'REF': 'Referenced By', + 'FREF': 'Foreign References', + 'OREF': 'Other References', + 'COFC': 'Certificate of Correction', + 'REEX': 'Re-Examination Certificate', + 'PTAB': 'PTAB Trial Certificate', + 'SEC': 'Supplemental Exam Certificate', + 'ILRN': 'International Registration Number', + 'ILRD': 'International Registration Date', + 'ILPD': 'International Registration Publication Date', + 'ILFD': 'Hague International Filing Date' + }) + for k, v in args.items(): + if k == 'string' and '/' in v: + (kk, p, v) = v.partition('/') + if v and kk.upper() in search_codes: + args[k] = '"{}"'.format(v) + searchstring = ' and '.join(['%s/%s' % (key, value) for (key, value) in args.items()]) searchstring = searchstring.replace('string/', '') searchstring = searchstring.replace(' ', '+') + searchstring = searchstring.replace('-and-', '+and+') replace_dict = {'/': '%2F'} - for k, v in replace_dict.items(): searchstring = searchstring.replace(k, v) base_url = 'http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p=1&f=S&l=50&Query=' - url = base_url + searchstring + '&d=PTXT' r = self.web_connection.get(url) s = BeautifulSoup(r, 'html.parser') - total_results = int(s.find(string=re.compile('out of')).find_next().text.strip()) - - patents = self.get_patents_from_results_url(url, limit=results_limit) - - num_results_fetched = len(patents) - - list_num = 2 - - base_url_nextpgs = 'http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&f=S&l=50&d=PTXT' - - url_pre = base_url_nextpgs + '&OS=' + searchstring + '&RS=' + searchstring + '&Query=' + searchstring + '&TD=' + str(total_results) + '&Srch1=' + searchstring + '&NextList' - url_post = '=Next+50+Hits' - - while (num_results_fetched < total_results) and (num_results_fetched < results_limit): - this_url = url_pre + str(list_num) + url_post - thispatents = self.get_patents_from_results_url(this_url) - patents.extend(thispatents) - + if s.find(string=re.compile('out of')): #only proceed with function if search produces results + total_results = int(s.find(string=re.compile('out of')).find_next().text.strip()) + patents = self.get_patents_from_results_url(url, limit=results_limit) num_results_fetched = len(patents) + list_num = 2 + while (num_results_fetched < total_results) and (num_results_fetched < results_limit): + url_nextpg = 'http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p={0}&f=S&l=50&Query={1}&d=PTXT'.format(list_num, searchstring) + + thispatents = self.get_patents_from_results_url(url_nextpg, limit=(results_limit - num_results_fetched)) + patents.extend(thispatents) - if num_results_fetched >= results_limit: - patents = patents[:results_limit] + num_results_fetched = len(patents) - list_num += 1 + if num_results_fetched >= results_limit: + patents = patents[:results_limit] + list_num += 1 + else: + patents = [] self.patents = patents def get_patents_from_results_url(self, url: str, limit: int = None) -> list: @@ -314,18 +371,17 @@ def get_patents_from_results_url(self, url: str, limit: int = None) -> list: i.text.replace('\n', '').strip() != ''] patents = [] - for patent_num_idx in range(0, len(patents_raw_list), 2): - if limit and (patent_num_idx + 1) > limit: + if limit and (patent_num_idx/2 + 1) > limit: break patent_title = patents_raw_list[patent_num_idx + 1][0] patent_title = re.sub(' +', ' ', patent_title) patent_link = patents_raw_list[patent_num_idx][1] + p = Patent(patent_title, patent_link, self.web_connection) if self.get_patent_details: p.fetch_details() patents.append(p) - return patents def as_dataframe(self) -> pd.DataFrame: