I have created a solution using python which extracts highlighted portions from the PDF using pymupdf
and fitz
.
This is the code for the same.
def _parse_highlight(annot: fitz.Annot, wordlist: List[Tuple[float, float, float, float, str, int, int, int]]) -> str:
points = annot.vertices
quad_count = int(len(points) / 4)
sentences = []
for i in range(quad_count):
# where the highlighted part is
r = fitz.Quad(points[i * 4 : i * 4 + 4]).rect
words = [w for w in wordlist if fitz.Rect(w[:4]).intersects(r)]
sentences.append(" ".join(w[4] for w in words))
sentence = " ".join(sentences)
string_split = sentence.split(",")
sent = " ".join(sorted(set(string_split), key=string_split.index))
output = re.sub(r'\b(\d+(?:\.\d+)?)\b', r'\1,', sent)
return output
def handle_page(page):
wordlist = page.getText("words") # list of words on page
wordlist.sort(key=lambda w: (w[3], w[0])) # ascending y, then x
highlights = []
annot = page.firstAnnot
while annot:
if annot.type[0] == 8:
highlights.append(_parse_highlight(annot, wordlist))
annot = annot.next
return highlights
def main():
filepath = [file for file in glob.glob("Folder/*.pdf")]
for file in filepath:
doc = fitz.open(file)
#print(file)
highlights = []
for page in doc:
highlights += handle_page(page)
print(highlights)
This when run on 2 PDFs gives me a result like this:-
list1: ['Ashmore 2, 554, 1.06, Close Brothers 2, 704, 1.12,', 'Close Brothers 2, 704, 1.12]
list2: ['HDFC Bank Ltd. 6.98, ICICI Bank Ltd. 4.82, Infosys Ltd. 4.37']
In list1, The no of columns which was highlighted in 1PDF was 3 In list2, The no of columns which was highlighted in 2PDF was 2.
If I have to build a dataframe from this. How do I develop a logic to build a generalized one(if a pdf is having 4 columns so then in dataframe as well it should be 4) So that the dataframe columns will be in sync with the list above? This is how I have built the dataframe logic
def main():
try:
filepath = [file for file in glob.glob("Folder/*.pdf")]
my_df = pd.DataFrame()
for file in filepath:
doc = fitz.open(file)
#print(file)
highlights = []
for page in doc:
highlights += handle_page(page)
#print(highlights)
for i in range(len(highlights)):
highlights_alt = highlights[i].split(',')
#print(highlights_alt)
df = pd.DataFrame(highlights_alt, columns=['Security Name'])
#print(df.columns.tolist())
df[['Name', 'Value']] = df['Name'].str.rsplit(n=1, expand=True)
df.drop_duplicates(keep='first', inplace=True)
print(df.head())
print(df.shape)
This code fails/doesn't work properly if columns highlighted from pdfs are more than 2. How do I generalize this? Please help!!
EDIT:
Using Laurent's code I get this output which is not expected.
This is the list that I get when I print highlights
=
['DWF 1, 340, 0.84,', 'Equiniti 491, 0.31,', 'Inchcape 947, 0.59,', 'Speedy Hire 1, 054, 0.66,']
This is the highlights_alt I get after printing it highlights_alt
=
[['D'], ['W'], ['F'], [' '], ['1'], ['', ''], [' '], ['3'], ['4'], ['0'], ['', ''], [' '], ['0'], ['.'], ['8'], ['4'], ['', '']]
[['E'], ['q'], ['u'], ['i'], ['n'], ['i'], ['t'], ['i'], [' '], ['4'], ['9'], ['1'], ['', ''], [' '], ['0'], ['.'], ['3'], ['1'], ['', '']]
[['I'], ['n'], ['c'], ['h'], ['c'], ['a'], ['p'], ['e'], [' '], ['9'], ['4'], ['7'], ['', ''], [' '], ['0'], ['.'], ['5'], ['9'], ['', '']]
[['S'], ['p'], ['e'], ['e'], ['d'], ['y'], [' '], ['H'], ['i'], ['r'], ['e'], [' '], ['1'], ['', ''], [' '], ['0'], ['5'], ['4'], ['', ''], [' '], ['0'], ['.'], ['6'], ['6'], ['', '']]
This is the dataframe that I get df
=
Name Value_0
0 D 0
1 W 0
2 F 0
3 0
4 1 0
Name Value_0
0 E 0
1 q 0
2 u 0
3 i 0
4 n 0
Name Value_0
0 I 0
1 n 0
2 c 0
3 h 0
5 a 0
Name Value_0
0 S 0
1 p 0
2 e 0
4 d 0
5 y 0