I am trying to parse an SGML file that looks like this:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE PATDOC SYSTEM "ST32-US-Grant-025xml.dtd" [
<!ENTITY USD0453058-20020129-D00000.TIF SYSTEM "USD0453058-20020129-D00000.TIF" NDATA TIF>
<!ENTITY USD0453058-20020129-D00001.TIF SYSTEM "USD0453058-20020129-D00001.TIF" NDATA TIF>
<!ENTITY USD0453058-20020129-D00002.TIF SYSTEM "USD0453058-20020129-D00002.TIF" NDATA TIF>
<!ENTITY USD0453058-20020129-D00003.TIF SYSTEM "USD0453058-20020129-D00003.TIF" NDATA TIF>
<!ENTITY USD0453058-20020129-D00004.TIF SYSTEM "USD0453058-20020129-D00004.TIF" NDATA TIF>
<!ENTITY USD0453058-20020129-D00005.TIF SYSTEM "USD0453058-20020129-D00005.TIF" NDATA TIF>
]>
<PATDOC DTD="2.5" STATUS="Build 20020101">
<SDOBI>
<B100>
<B110><DNUM><PDAT>D0453058</PDAT></DNUM></B110>
<B130><PDAT>S1</PDAT></B130>
<B140><DATE><PDAT>20020129</PDAT></DATE></B140>
<B190><PDAT>US</PDAT></B190>
</B100>
<B200>
<B210><DNUM><PDAT>29124868</PDAT></DNUM></B210>
<B211US><PDAT>29</PDAT></B211US>
<B220><DATE><PDAT>20000613</PDAT></DATE></B220>
</B200>
<B400>
<B472>
<B474><PDAT>14</PDAT></B474>
</B472>
</B400>
<B500>
<B510>
<B511><PDAT>0201</PDAT></B511>
<B516><PDAT>7</PDAT></B516>
</B510>
<B520>
<B521><PDAT>D 2703</PDAT></B521>
<B522><PDAT> D2700</PDAT></B522>
<B522><PDAT> D2701</PDAT></B522>
<B522><PDAT> D2702</PDAT></B522>
<B522><PDAT> D2706</PDAT></B522>
</B520>
<B540><STEXT><PDAT>Corset</PDAT></STEXT></B540>
<B560>
<B561>
<PCIT>
<DOC><DNUM><PDAT>2131457</PDAT></DNUM>
<DATE><PDAT>19380900</PDAT></DATE>
<KIND><PDAT>A</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Tachat</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>450 41</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>2420575</PDAT></DNUM>
<DATE><PDAT>19470500</PDAT></DATE>
<KIND><PDAT>A</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Treadwell</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>450 39</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D164485</PDAT></DNUM>
<DATE><PDAT>19510900</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Fredericks</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2706</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D166760</PDAT></DNUM>
<DATE><PDAT>19520500</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>La Bue et al.</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2702</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D169510</PDAT></DNUM>
<DATE><PDAT>19530500</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>La Bue et al.</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2702</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D169511</PDAT></DNUM>
<DATE><PDAT>19530500</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>LaBue et al.</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2702</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D174114</PDAT></DNUM>
<DATE><PDAT>19550300</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Dior</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2702</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D174194</PDAT></DNUM>
<DATE><PDAT>19550300</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Prochaska</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2704</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D177067</PDAT></DNUM>
<DATE><PDAT>19560300</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Reisenfeld</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2702</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D177181</PDAT></DNUM>
<DATE><PDAT>19560300</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Kurzman</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2706</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D177982</PDAT></DNUM>
<DATE><PDAT>19560600</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Kahn</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2702</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>3090387</PDAT></DNUM>
<DATE><PDAT>19630500</PDAT></DATE>
<KIND><PDAT>A</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Hopper</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>450 74</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D206157</PDAT></DNUM>
<DATE><PDAT>19661100</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Maas</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2706</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D220741</PDAT></DNUM>
<DATE><PDAT>19710500</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Marcario et al.</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2707</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D265265</PDAT></DNUM>
<DATE><PDAT>19820700</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Stern et al.</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2710</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D276001</PDAT></DNUM>
<DATE><PDAT>19841000</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Locascio</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2710</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B561>
<PCIT>
<DOC><DNUM><PDAT>D373236</PDAT></DNUM>
<DATE><PDAT>19960900</PDAT></DATE>
<KIND><PDAT>S</PDAT></KIND>
</DOC>
<PARTY-US>
<NAM><SNM><STEXT><PDAT>Vass-Betts</PDAT></STEXT></SNM></NAM>
</PARTY-US>
<PNC><PDAT>D 2710</PDAT></PNC></PCIT><CITED-BY-EXAMINER/>
</B561>
<B562><NCIT><STEXT><PDAT>Frederick's of Hollywood; p. 37—Lace corset on upper right corner of page, (Item F).* </PDAT></STEXT></NCIT><CITED-BY-OTHER/></B562>
<B562><NCIT><STEXT><PDAT>Frederick's of Hollywood; p. 47—black corset on lower right corner of page, (Item K).* </PDAT></STEXT></NCIT><CITED-BY-OTHER/></B562>
<B562><NCIT><STEXT><PDAT>Frederick's of Hollywood; p. 49—White lace corset, Item F. (middle of page).</PDAT></STEXT></NCIT><CITED-BY-EXAMINER/></B562>
</B560>
<B570>
<B577><PDAT>1</PDAT></B577>
<B578US><PDAT>1</PDAT></B578US>
</B570>
<B580>
<B583US><PDAT>D 2700-714</PDAT></B583US>
<B582><PDAT>D 2738</PDAT></B582>
<B582><PDAT>D24190</PDAT></B582>
<B582><PDAT>D24124</PDAT></B582>
<B582><PDAT>D24126</PDAT></B582>
<B582><PDAT> 2 44</PDAT></B582>
<B582><PDAT> 2 48</PDAT></B582>
<B582><PDAT> 2 67</PDAT></B582>
<B582><PDAT> 2 782</PDAT></B582>
<B582><PDAT> 2 783</PDAT></B582>
<B582><PDAT> 2227</PDAT></B582>
<B582><PDAT> 2228</PDAT></B582>
<B582><PDAT> 2238</PDAT></B582>
<B582><PDAT> 2400</PDAT></B582>
<B582><PDAT> 2402</PDAT></B582>
<B582><PDAT> 2403</PDAT></B582>
<B582><PDAT> 2406</PDAT></B582>
<B582><PDAT> 2408</PDAT></B582>
<B582><PDAT> 2DIG 9</PDAT></B582>
<B582><PDAT> 2300</PDAT></B582>
<B582><PDAT> 2302</PDAT></B582>
<B583US><PDAT>450 1- 5</PDAT></B583US>
<B582><PDAT>450 13</PDAT></B582>
<B582><PDAT>450 30</PDAT></B582>
<B582><PDAT>450 32</PDAT></B582>
<B582><PDAT>450 39</PDAT></B582>
<B582><PDAT>450 41</PDAT></B582>
<B582><PDAT>450 43</PDAT></B582>
<B582><PDAT>450 47</PDAT></B582>
<B582><PDAT>450 48</PDAT></B582>
<B582><PDAT>450 54</PDAT></B582>
<B582><PDAT>450 57</PDAT></B582>
<B582><PDAT>450 58</PDAT></B582>
<B582><PDAT>450 74</PDAT></B582>
<B582><PDAT>450 83</PDAT></B582>
<B582><PDAT>450 86</PDAT></B582>
<B582><PDAT>450 89</PDAT></B582>
<B582><PDAT>450104</PDAT></B582>
<B582><PDAT>450109</PDAT></B582>
<B582><PDAT>450116</PDAT></B582>
</B580>
<B590><B595><PDAT>5</PDAT></B595><B596><PDAT>7</PDAT></B596><B597US/>
</B590>
</B500>
<B600>
<B630><B632><PARENT-US><CDOC><DOC><DNUM><PDAT>29/124868</PDAT></DNUM></DOC></CDOC><PDOC><DOC><DNUM><PDAT>29/107998</PDAT></DNUM><DATE><PDAT>19990720</PDAT></DATE><CTRY><PDAT>US</PDAT></CTRY><KIND><PDAT>00</PDAT></KIND></DOC></PDOC><PSTA><PDAT>03</PDAT></PSTA></PARENT-US></B632></B630>
</B600>
<B700>
<B720>
<B721>
<PARTY-US>
<NAM><FNM><PDAT>Park</PDAT></FNM><SNM><STEXT><PDAT>Kim</PDAT></STEXT></SNM></NAM>
<ADR>
<STR><PDAT>33-102 Hyosung Villa, 63 Chungdam-dong, Kangnam-ku</PDAT></STR>
<CITY><PDAT>Seoul</PDAT></CITY>
<CTRY><PDAT>KR</PDAT></CTRY>
</ADR>
</PARTY-US>
</B721>
</B720>
<B740>
<B741>
<PARTY-US>
<NAM><ONM><STEXT><PDAT>Jacobson Holman, PLLC</PDAT></STEXT></ONM></NAM>
</PARTY-US>
</B741>
</B740>
<B745>
<B746>
<PARTY-US>
<NAM><FNM><PDAT>Celia</PDAT></FNM><SNM><STEXT><PDAT>Murphy</PDAT></STEXT></SNM></NAM>
</PARTY-US>
</B746>
<B748US><PDAT>2914</PDAT></B748US>
</B745>
</B700>
</SDOBI>
<SDODE>
<DRWDESC>
<BTEXT>
<PARA ID="P-00001" LVL="7"><PTEXT><PDAT>FIG. 1 is a perspective view of a corset showing my new design;</PDAT></PTEXT></PARA>
<PARA ID="P-00002" LVL="7"><PTEXT><PDAT>FIG. 2 is a bottom plan view thereof;</PDAT></PTEXT></PARA>
<PARA ID="P-00003" LVL="7"><PTEXT><PDAT>FIG. 3 is a top plan view thereof;</PDAT></PTEXT></PARA>
<PARA ID="P-00004" LVL="7"><PTEXT><PDAT>FIG. 4 is a front elevational view thereof;</PDAT></PTEXT></PARA>
<PARA ID="P-00005" LVL="7"><PTEXT><PDAT>FIG. 5 is a rear elevational view thereof;</PDAT></PTEXT></PARA>
<PARA ID="P-00006" LVL="7"><PTEXT><PDAT>FIG. 6 is a left side view thereof; and,</PDAT></PTEXT></PARA>
<PARA ID="P-00007" LVL="7"><PTEXT><PDAT>FIG. 7 is a right side view thereof.</PDAT></PTEXT></PARA>
<PARA ID="P-00008" LVL="7"><PTEXT><PDAT>The broken lines shown throughout the views are understood to represent stitching.</PDAT></PTEXT></PARA>
</BTEXT>
</DRWDESC>
</SDODE>
<SDOCL>
<CL>
<CLM ID="CLM-00001">
<PARA ID="P-00009" LVL="7"><PTEXT><PDAT>The ornamental design for a corset, as shown and described.</PDAT></PTEXT></PARA>
</CLM>
</CL>
</SDOCL>
<SDODR ID="DRAWINGS">
<EMI ID="EMI-D00000" ALT="embedded image" FILE="USD0453058-20020129-D00000.TIF"/>
<EMI ID="EMI-D00001" ALT="embedded image" FILE="USD0453058-20020129-D00001.TIF"/>
<EMI ID="EMI-D00002" ALT="embedded image" FILE="USD0453058-20020129-D00002.TIF"/>
<EMI ID="EMI-D00003" ALT="embedded image" FILE="USD0453058-20020129-D00003.TIF"/>
<EMI ID="EMI-D00004" ALT="embedded image" FILE="USD0453058-20020129-D00004.TIF"/>
<EMI ID="EMI-D00005" ALT="embedded image" FILE="USD0453058-20020129-D00005.TIF"/>
</SDODR>
</PATDOC>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE PATDOC SYSTEM "ST32-US-Grant-025xml.dtd" [
<!ENTITY USD0453059-20020129-D00000.TIF SYSTEM "USD0453059-20020129-D00000.TIF" NDATA TIF>
<!ENTITY USD0453059-20020129-D00001.TIF SYSTEM "USD0453059-20020129-D00001.TIF" NDATA TIF>
<!ENTITY USD0453059-20020129-D00002.TIF SYSTEM "USD0453059-20020129-D00002.TIF" NDATA TIF>
<!ENTITY USD0453059-20020129-D00003.TIF SYSTEM "USD0453059-20020129-D00003.TIF" NDATA TIF>
<!ENTITY USD0453059-20020129-D00004.TIF SYSTEM "USD0453059-20020129-D00004.TIF" NDATA TIF>
<!ENTITY USD0453059-20020129-D00005.TIF SYSTEM "USD0453059-20020129-D00005.TIF" NDATA TIF>
]>
<PATDOC DTD="2.5" STATUS="Build 20020101">
<SDOBI>
<B100>
to produce a CSV with certain values separated by carats. I used code for similar XML documents that I attempted to modify for this purpose as below:
import os
import io
from bs4 import BeautifulSoup
import csv
import requests
import re
directory_in_str = 'directorypath'
directory = os.fsencode(directory_in_str)
results_info = []
results_assignees = []
results_citations = []
results_inventors = []
results_lawyers = []
results_lawfirms = []
results_secondary = []
for file in os.listdir(directory):
filename = os.fsdecode(file)
full_name = directory_in_str + filename
handler = open(full_name).read()
soup = BeautifulSoup(handler, 'html.parser')
full_patent=soup.find_all('SDOBI')
for full_pat in full_patent:
pub_ref=full_pat.find_all('B100')
for patent in pub_ref:
doc_num = patent.find_all('DNUM')
doc_num2 = doc_num.findall('PDAT')
grant_date = patent.find_all('DATE')
grant_date2 = grant_date.findall('PDAT')
doc_num_str = re.search('>(.*)<', str(doc_num2))
try:
doc_str = doc_num_str.group(1)
except:
doc_str = ' '
while (len(doc_str) > 1 and len(doc_str) < 8):
doc_str = doc_str[0] + '0' + doc_str[1:]
grant_date_str = re.search('>(.*)<', str(grant_date2))
try:
grantd_str = grant_date_str.group(1)
except:
grantd_str = ' '
try:
granty_str = grant_date_str.group(1)[0:4]
except:
granty_str = ' '
claims = full_pat.find_all('CLM')
num_claims = 0
for claim in claims:
num_claims +=1
claim_str = str(num_claims) #no explicit number of claims so changed to iterator
try:
num_claims = claim_str.group(1)
except:
num_claims = ' '
app_ref=full_pat.find_all('B200')
for app in app_ref:
app_date = app.find_all('DATE')
app_date2 = app_date.find_all('PDAT')
app_num = app.find_all('DNUM')
app_num2 = app_num.find_all('PDAT')
app_num_str = re.search('>(.*)<', str(app_num2))
try:
app_numb = app_num_str.group(1)
except:
app_numb = ' '
app_date_str = re.search('>(.*)<', str(app_date2))
try:
appl_date = app_date_str.group(1)
except:
appl_date = ' '
try:
appl_year = app_date_str.group(1)[0:4]
except:
appl_year = ' '
nat_class=full_pat.find_all('B520')
for mclass in nat_class:
classes = mclass.find_all('PDAT') #should be a list of strings
main_class=classes[0] #should return the first thing in the list
main_class_str = re.search('>(.*)<', str(main_class).replace(" ", ""))
try:
prim_class = main_class_str.group(1)[0:2]
except:
prim_classs = ' '
try:
sec_class = main_class_str.group(1)[2:]
except:
sec_class = ' '
secondary_classes = classes[1:]
secondary_classes_str = str(secondaryclasses).replace('<PDAT>', "")
secondary_classes_str = secondary_classes_str.replace('</PDAT>', "^")
secondary_classes_str = secondary_classes_str.replace(' ', "")+grant_date_str.group(1)+"^" +grant_date_str.group(1)[0:4]+"^" +app_date_str.group(1)+"^" +app_date_str.group(1)[0:4]+"^"+main_class_str.group(1)[0:2] + "^" +main_class_str.group(1)[2:])
single_result_info = [doc_str, num_claims, app_numb, grantd_str, granty_str, appl_date, appl_year, prim_class, sec_class]
single_result_info_str = "^".join(str(a) for a in single_result_info)
single_result_secondary = [doc_str, secondary_classes_str]
single_result_secondary_str = "^".join(str(a) for a in single_result_secondary)
results_secondary += [single_result_secondary_str]
with io.open('test_design_patents_info_2002.csv', 'w', encoding='utf-8') as f:
#Write header
f.writelines("pat num^claims^app num^grant date^grant year^app date^app year^primary classification^secondary classification"'\n')
for item in results_info:
f.writelines('%s\n' % item)
with io.open('test_design_patents_sec_class_2002.csv', 'w', encoding='utf-8') as f:
#Write header
f.writelines("pat num^secondary classifications"'\n')
for item in results_secondary:
f.writelines('%s\n' % item)
However, it is not returning anything--I'm not sure if maybe I am misunderstanding how SGML works, or if there is an easier way to do it. The thing that (I think) is throwing me the most is how everything is wrapped several tags deep that (I assume?) I have to drill down through one at a time.
Any help would be appreciated.