1.はじめに
- 特許データ全文を取得したい時に便利!
- LLMの進歩もあり、特許の全文データに色々したい事が多発。書誌事項や要約のテキストまでは結構とれるサービスが多いものの、きちんと全文(特に海外の特許文献)をとりたい。
2.コード
- 準備としては、SerpAPIでアカウントを作成し、APIキーを取得。
- pandas dataframeに公報番号(Google PatentsのURLの後ろの部分)を格納しておく。
- 下記のコードを使って、全文データを取得して保存後のdataframeをxlsxで出力
Colaboratory上で実行する場合、
!pip install google-search-results==2.4.2 -q
from serpapi import GoogleSearch
from google.colab import userdata
# SerpApi key, read through google.colab's `userdata` under the name 'SERP_API_KEY'.
SERPAPI_API_KEY = userdata.get('SERP_API_KEY')
#@title Fetch patent details via SerpApi
def get_pat_summary_pdf(patent_id):
    """Query SerpApi's ``google_patents_details`` engine for one patent.

    Args:
        patent_id: Publication identifier; it is appended to ``"patent/"``
            to build the ``patent_id`` request parameter.

    Returns:
        The response dictionary returned by ``GoogleSearch.get_dict()``.
    """
    params = {
        "engine": "google_patents_details",
        "patent_id": "patent/" + patent_id,
        "api_key": SERPAPI_API_KEY,
    }
    search = GoogleSearch(params)
    return search.get_dict()
- 上記のDetails APIでdescription等の全文データへのリンク(description_link)を取得しておき(そうしないと全文が取れない)、そのリンク先から全文データをとる。
#@title Fetch the full description text for a given patent id
import requests


def get_content_from_url(patent_id):
    """Return the description content of a patent as text.

    Looks up the patent with ``get_pat_summary_pdf`` and downloads the
    document referenced by the ``"description_link"`` field of the response.

    Args:
        patent_id: Publication identifier passed to ``get_pat_summary_pdf``.

    Returns:
        The body of the description page as a string, or the literal string
        ``"error"`` when the lookup or the download fails for any reason.
    """
    try:
        res = get_pat_summary_pdf(patent_id)
        desc_link = res["description_link"]
        # A timeout keeps one stalled connection from hanging a whole batch.
        response = requests.get(desc_link, timeout=30)
        # Treat HTTP error responses as failures instead of returning the
        # error page as if it were the description text.
        response.raise_for_status()
        return response.text
    except Exception:
        # Best-effort by design: a failed record is marked, not fatal.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be stopped.
        return "error"
- このdesc_linkのリンク先に全文データが入っている。
- developer planだと1000件/Hの制限があるので、100件ずつ概ね5分毎に取得するように設定
#@title Process 100 records per batch
import math
import os
import time

# Number of rows fetched per batch.
BATCH_LEN = 100
# Number of batches needed to cover every row of `df`.
LOOP_NUM = math.ceil(df.shape[0] / BATCH_LEN)
print(LOOP_NUM)
# Directory that receives one xlsx file per batch.
BASE_URI = "path/to/save"
for ind in range(LOOP_NUM):
    # Pause between batches (not before the first one) to stay within the
    # API's hourly request limit.
    if ind > 0:
        time.sleep(120)
    # Positional slice of the ind-th window of BATCH_LEN rows; copy it so
    # adding a column does not write into a view of `df`.
    df_temp = df.iloc[ind * BATCH_LEN:(ind + 1) * BATCH_LEN].copy()
    df_temp["desc"] = df_temp["公報番号"].apply(get_content_from_url)
    # os.path.join inserts the path separator whether or not BASE_URI
    # ends with one.
    df_temp.to_excel(os.path.join(BASE_URI, "desc_" + str(ind) + ".xlsx"))
3.その他
DetailsAPIはこんなレスポンスなので、description以外でも請求項全文など色々取れる。
{
...
"title": "String - Patent / scholar document title",
"type": "String - Type of the document. The value can either be a 'patent' or a 'scholar'",
"header": "String - Document header",
"pdf": "String - URL to the PDF",
"publication_number": "String - The publication number of the patent",
"publication_year": "String - The publication year of the scholar document",
"publication_venue": "String - The publication venue of the scholar document",
"full_view_url": "String - URL to the full view",
"institution_url": "String - URL to the institution",
"cited_by_url": "String - URL to whom the document is cited by",
"main_url": "String - Main URL to the document",
"other_versions_url": "String - URL to the other versions of the document",
"country": "String - Country of the patent",
"prior_art_keywords": "Array - Keywords of prior art",
"prior_art_date": "String - Date of prior art",
"application_number": "String - Patent's application number",
"inventors": [
{
"name": "String - Inventor's name",
"link": "String - Search link of the inventor",
"serpapi_link": "String - URL to the SerpApi search",
}
],
"assignees": "Array - List of assignees",
"authors": "Array - List of authors",
"priority_year": "String - Patent's priority year",
"priority_date": "String - Patent's priority date",
"filing_date": "String - Patent's filing date",
"publication_date": "String - Patent's publication date",
"worldwide_applications": {
"<year>": [
{
"filing_date": "String - Filing date of the application",
"country_code": "String - Country code of the application",
"application_number": "String - Application number",
"document_id": "String - Document ID",
"legal_status_cat": "String - Legal status category",
"legal_status": "String - Legal status",
"this_app": "Boolean - True if it's the current application",
}
]
},
"events": [
{
"date": "String - Date of the event",
"title": "String - Title of the event",
"type": "String - Event type",
"critical": "Boolean - True if the event is critical",
"assignee_search": "String - Associated assignee",
}
],
"external_links": [
{
"text": "String - Link text",
"link": "String - Link URL",
}
],
"images": "Array - List of document's images",
"classifications": [
{
"code": "String - Classification code",
"description": "String - The description",
"leaf": "Boolean - True if it's a leaf classification",
"is_cpc": "Boolean - True if it's a CPC",
}
],
"abstract": "String - Abstract of the patent",
"abstract_original": "String - Abstract in the original language",
"snippet": "String - Snippet of the scholar document",
"description_link": "String - URL to the HTML content of the patent description",
"claims": "Array - List of patent claims",
"child_applications": [
{
"application_number": "String - Application number",
"relation_type": "String - Relation type",
"representative_publication": "String - Representative publication number",
"primary_language": "String - Primary language",
"priority_date": "String - Priority date of the application",
"filing_date": "String - Filing date of the application",
"title": "String - Application title",
}
],
"parent_applications": "Array - List of parent applications, with the same structure of `child_applications`",
"priority_applications": "Array - List of priority applications, with the same structure of `child_applications`",
"family_id": "String - Family ID",
"applications_claiming_priority": "Array - List of applications claiming priority, with the same structure of `child_applications`",
"patent_citations": {
"original": [
{
"patent_id": "String - ID of the cited patent",
"serpapi_link": "String - URL to the SerpApi search",
"publication_number": "String - Publication number of the cited patent",
"primary_language": "String - Primary language of the cited patent",
"priority_date": "String - Priority date of the cited patent",
"publication_date": "String - Publication date of the cited patent",
"assignee_original": "String - Original assignee",
"title": "String - Title of the cited patent"
}
],
"family_to_family": "Array - List of family to family citations, with the same structure of `original`",
},
"non_patent_citations": "Array - List of non-patent citations",
"cited_by": {
"original": "Array - List of original citations, with the same structure of `patent_citations.original`",
"family_to_family": "Array - List of family to family citations, with the same structure of `patent_citations.original`",
},
"similar_documents": [
{
"is_patent": "Boolean - True if it's a patent",
"is_scholar": "Boolean - True if it's a scholar document",
"patent_id": "String - ID of the patent / scholar",
"serpapi_link": "String - URL to the SerpApi search",
"publication_number": "String - Publication number of the patent",
"primary_language": "String - Primary language of the patent",
"publication_date": "String - Publication date of the patent / scholar",
"title": "String - Title of the patent / scholar",
"scholar_id": "String - ID of the scholar document",
"scholar_authors": "String - Authors of the scholar document",
}
],
"legal_events": [
{
"date": "String - Date of the event",
"code": "String - Event code",
"title": "String - Event title",
"attributes": [
{
"label": "String - Attribute label",
"value": "String - Attribute value",
}
]
}
],
"concepts": {
"match": [
{
"id": "String - Concept ID",
"name": "String - Concept name",
"domain": "String - Concept domain",
"similarity": "Float - Concept similarity",
"sections": "Array - List of sections",
"count": "Integer - Count of the concept",
}
]
},
...
}