1

What is the most efficient way in Python to walk a nested list of `PyPDF2.generic.Destination` objects and sub-lists all the way down to the deepest list, so that every level of a PDF outline is collected together with its page number?

def _iter_outline(entries, depth=1):
    """Yield ``(entry, depth)`` for every non-list entry in *entries*.

    A nested list is treated as the children of the current level and is
    walked recursively with ``depth + 1``, so outlines of any nesting depth
    are handled. Entries are yielded depth-first, in list order.
    """
    for entry in entries:
        if isinstance(entry, list):
            yield from _iter_outline(entry, depth + 1)
        else:
            yield entry, depth


pdfread = p2.PdfFileReader(list_files)
s = pdfread.outlines
chapters = []
pages = []
levels = []
# Top-level outline entries (both leaf entries and child lists), unmodified.
files = list(s)

# The three lists are filled in lockstep: one row per outline entry.
for dest, depth in _iter_outline(s):
    chapters.append(dest.title)
    pages.append(pdfread.getDestinationPageNumber(dest))
    # Levels are stored as strings ('1' for top-level entries, '2', ...).
    levels.append(str(depth))

df = pd.DataFrame({'chapters': chapters, 'pages': pages, 'levels': levels})
df.head(44)
Margosia
  • 27
  • 4

1 Answer

2

Just define a recursive function:

def nested_is_instance_check(s, level, pages):
    """Recursively record every outline entry reachable from *s*.

    Args:
        s: Either a list of outline entries (which may itself contain
            nested lists of children) or a single leaf entry exposing a
            ``title`` attribute.
        level: Nesting level of *s* itself. Pass ``0`` together with the
            top-level outline list so that its direct entries are
            recorded as level ``'1'``.
        pages: List that receives the page number of each leaf entry.

    Side effects:
        Appends to the module-level ``chapters`` and ``levels`` lists and
        to *pages*, and looks page numbers up through the module-level
        ``pdfread`` reader. Levels are stored as strings.
    """
    if isinstance(s, list):
        # A list holds the entries one level below the current one.
        for child in s:
            nested_is_instance_check(child, level + 1, pages)
        return

    chapters.append(s.title)
    levels.append(str(level))
    pages.append(pdfread.getDestinationPageNumber(s))

`nested_is_instance_check` calls itself for every nested list in the outline, so it works for any nesting depth rather than a fixed number of levels.

Sebastian R.
  • 412
  • 4
  • 10