
I have several bank statements from our users and am trying to figure out a way to parse the rows of transactions. I have previously used PDFBox with PDFTextStripper and PDFTextStripperByArea, but I am not sure how to proceed with bank statements, since they will have an undetermined number of rows, and the rows may or may not be of fixed size.

Amit
  • You will have to do a more detailed analysis of the document types you'll retrieve, and more dynamic extraction than just taking the text from some area. – mkl Apr 07 '18 at 21:35
  • @mkl What kind of dynamic extraction would be required? I have tried using Tabula, but the results are suboptimal at best. – Amit Apr 08 '18 at 13:59
  • You have to identify the start and the end of the tabular data. You can do that e.g. by means of standard phrases before/after the table or by specific drawings (lines, backgrounds, ...). If there are no such markers across all your documents, you have to categorize the documents into separate groups (a sketch of this marker-based approach follows below). – mkl Apr 08 '18 at 15:30
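
A minimal sketch of the marker-based approach mkl describes, assuming PDFBox 2.x: extract the full text, then keep only the lines between two marker phrases. The marker strings "ACCOUNT ACTIVITY" and "TOTAL FEES CHARGED" are hypothetical placeholders for whatever actually brackets the table in your statements.

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

public class MarkerBasedExtractor {
    // returns only the lines that sit between the (hypothetical) start and end markers
    public static List<String> extractTableLines(File pdf) throws Exception {
        try (PDDocument doc = PDDocument.load(pdf)) {
            String text = new PDFTextStripper().getText(doc);
            List<String> tableLines = new ArrayList<>();
            boolean inTable = false;
            for (String line : text.split("\\r?\\n")) {
                if (line.contains("ACCOUNT ACTIVITY")) {          // hypothetical start marker
                    inTable = true;
                } else if (line.contains("TOTAL FEES CHARGED")) { // hypothetical end marker
                    inTable = false;
                } else if (inTable) {
                    tableLines.add(line);
                }
            }
            return tableLines;
        }
    }
}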

2 Answers


I wrote just such a parser to parse our Chase PDF credit card statements, to speed up tax preparation, with the help of an open source project called Apache Tika.

You just need to include the Tika core and parsers artifacts in your pom.xml dependencies:

        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
            <version>1.17</version>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers</artifactId>
            <version>1.17</version>
        </dependency>

The PDF extractor is fairly straightforward:

import org.apache.commons.lang3.time.StopWatch; // from commons-lang3, missing in the original listing
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;


public class PdfExtractor {
    private static Logger logger = LoggerFactory.getLogger(PdfExtractor.class);

    public static void main(String[] args) throws Exception {
        StopWatch sw = StopWatch.createStarted(); // times the whole run
        List<String> files = new ArrayList<>();   // the monthly statement PDFs to process
        files.add("C:/Users/m/Downloads/20170115.pdf");
        files.add("C:/Users/m/Downloads/20170215.pdf");
        files.add("C:/Users/m/Downloads/20170315.pdf");
        files.add("C:/Users/m/Downloads/20170415.pdf");
        files.add("C:/Users/m/Downloads/20170515.pdf");
        files.add("C:/Users/m/Downloads/20170615.pdf");
        files.add("C:/Users/m/Downloads/20170715.pdf");
        files.add("C:/Users/m/Downloads/20170815.pdf");
        files.add("C:/Users/m/Downloads/20170915.pdf");
        files.add("C:/Users/m/Downloads/20171015.pdf");
        files.add("C:/Users/m/Downloads/20171115.pdf");
        files.add("C:/Users/m/Downloads/20171215.pdf");
        files.add("C:/Users/m/Downloads/20180115.pdf");
        List<ChasePdfParser.ChaseRecord> full = new ArrayList<>();
        for (String fileName : files) {
            logger.info("Now processing " + fileName);
            // try-with-resources closes the stream even when parsing fails
            try (InputStream is = new FileInputStream(fileName)) {
                ContentHandler contenthandler = new BodyContentHandler();
                Metadata metadata = new Metadata();
                PDFParser pdfparser = new PDFParser();
                pdfparser.parse(is, contenthandler, metadata, new ParseContext());
                String data = contenthandler.toString();
                full.addAll(ChasePdfParser.parse(data));
            }
        }
        logger.info("Total processing time: " + PrettyPrinter.toMsSoundsGood(sw.getTime()));
        full.forEach(cr -> System.err.println(cr.date + "|" + cr.desc + "|" + cr.amt));
    }
}
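
One caveat with Tika here: BodyContentHandler uses a 100,000-character write limit by default, so a long statement can make parse() throw once the limit is reached. Constructing it with -1 disables the limit:

        // -1 disables the default 100k-character write limit
        ContentHandler contenthandler = new BodyContentHandler(-1);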

The line parser is also fairly straightforward; since each line has all the necessary info, it's easy to parse:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ChasePdfParser {
    private static Logger logger = LoggerFactory.getLogger(ChasePdfParser.class);

    private static final int FOR_TAX_YEAR = 2017;
    private static final String YEAR_EXTENSION = "/" + FOR_TAX_YEAR;
    // statement dates come as MM/dd; the tax year is appended before parsing
    private static final DateTimeFormatter check = DateTimeFormatter.ofPattern("MM/dd/uuuu");
    // lines matching these phrases are payments, not purchases, and are skipped
    private static final List<String> exclusions = Arrays.asList("Payment Thank You", "AUTOMATIC PAYMENT");

    public static List<ChaseRecord> parse(String data) {
        List<ChaseRecord> l = new ArrayList<>();
        for (String line : data.split("\n")) {
            if (line.isEmpty()) continue;
            // split on runs of whitespace; String.split never returns null
            String[] split = line.split("\\s+");
            if (split.length == 0) continue;
            String test = split[0];
            if (!isMMDD(test)) continue;
            if(skip(line)) continue;
            if (split.length < 4) continue;
            ChaseRecord cr = new ChaseRecord();
            cr.date = extractDate(test);
            try {
                String last = split[split.length - 1];
                last = last.replaceAll(",", "");
                cr.amt = Double.parseDouble(last);
            } catch (NumberFormatException e) {
                logger.warn("Could not parse amount in line: " + line, e);
            }
            cr.desc = String.join(" ", Arrays.copyOfRange(split, 1, split.length - 1));
            cr.desc = cr.desc.replaceAll("\\s\\s+", " ");
            l.add(cr);
        }
        return l;
    }

    private static boolean skip(String s) {
        if (s == null || s.isEmpty()) {
            return true;
        }
        for (String e : exclusions) {
            if (s.contains(e)) {
                return true;
            }
        }
        return false;
    }

    protected static LocalDate extractDate(String s) {
        if (!isMMDD(s)) {
            return null;
        }
        return LocalDate.parse(s + YEAR_EXTENSION, check);
    }

    public static boolean isMMDD(String s) {
        if (s == null || s.isEmpty() || s.length() != 5) {
            return false;
        }
        try {
            s += YEAR_EXTENSION;
            LocalDate.parse(s, check);
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    public static class ChaseRecord {
        public LocalDate date;
        public String desc;
        public Double amt;

        @Override
        public String toString() {
            return "ChaseRecord{" +
                    "date=" + date +
                    ", desc='" + desc + '\'' +
                    ", amt=" + amt +
                    '}';
        }
    }
}
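
To sanity-check the parser, you can feed it a single line shaped like a statement row (the line below is made up for illustration):

public class ChasePdfParserDemo {
    public static void main(String[] args) {
        // a fabricated row in the usual shape: MM/dd date, description, amount
        String sample = "01/15 AMAZON MKTPLACE PMTS AMZN.COM/BILL WA 1,234.56";
        ChasePdfParser.parse(sample).forEach(System.out::println);
        // prints: ChaseRecord{date=2017-01-15, desc='AMAZON MKTPLACE PMTS AMZN.COM/BILL WA', amt=1234.56}
    }
}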
user3892260

Late to the party. You can also use pdftotext as a workaround. Every once in a great while it will miss an amount of currency, particularly in the upper right of the table.

As you'd expect, you join the text on newlines and then start chopping the lines into lists, then write them out to a TSV. The approach looks like this (HTH):

import csv
import os
import re
from datetime import datetime

import pandas as pd  # imported for combining sources downstream; unused in this snippet
import pdftotext

# compile directory ref:
path='path to directory'
directory = os.fsencode(path)

# https://stackoverflow.com/questions/42202872/how-to-convert-list-to-row-dataframe-with-pandas
column_list = ['filesource','filedate','eventdate','description','bankcategory','amount']

filelist=[]

# an example of how to scrape chase statement pdf into list of lists:

def process_pdf_data(filename,filesource,filedate):

    # trying with pdftotext

    # print('starting pdf content scrape', file)

    with open(filename, "rb") as f:
        pdf = pdftotext.PDF(f)

    pdf_join="\n".join(pdf)
    pdf_array=pdf_join.split('\n')

    # print(pdf_array)

    startint = 0

    # at this point, pdf_array is just a list of strings read serially from the
    # pdf in succession down the page.

    # scan forward to the 'Account activity' header, guarding against running
    # off the end of the array if the header is missing
    while startint < len(pdf_array) and pdf_array[startint] != 'Account activity':
        startint += 1

    # drop data before 'Account activity' as we won't need it
    del pdf_array[:startint]

    # print(pdf_array)

    # set pattern for date detection
    # https://www.programiz.com/python-programming/regex
    # https://docs.python.org/3/library/re.html

    pattern=re.compile("^([A-Z]|[a-z]){3} [0-9]{1,2}, [0-9]{4}$") # test pattern for regex eval of date

    startint = 0  # reset the index for the main scan over pdf_array

    # print('entering pdf content eval', file)

    while startint < len(pdf_array):

        # if the string has the expected date format, treat it as the start of a
        # transaction record; anything that doesn't convert is suspect and could
        # be written to a log instead
        # print(startint, pdf_array[startint])

        # bound the lookahead below so we never index past the end of the list
        if startint + 6 < len(pdf_array) and pattern.match(pdf_array[startint]) is not None:

        #   transform it to date
        #   https://docs.python.org/3/library/datetime.html
            datestr=datetime.strptime(pdf_array[startint], '%b %d, %Y').date().isoformat()
            # print('pattern match',datestr)        


        #   look ahead and keep next few strings:
            description=pdf_array[startint+2]
            bankcategory=pdf_array[startint+4]
            amount=''
            if '$' in pdf_array[startint+6]:
                amount=pdf_array[startint+6] # will mess with $/string type conversion downstream, when combining sources

        #   write to list of lists

            templist=[]
            templist.append(filesource)
            templist.append(filedate)
            templist.append(datestr)
            templist.append(description)
            templist.append(bankcategory)
            templist.append(amount)

            # print(templist)
            filelist.append(templist)

        startint+=1

# example invocation -- the file name, source label, and statement date below
# are placeholders; substitute your own values
process_pdf_data('20170115.pdf', 'chase', '2017-01-15')

# finish up as described above: write the accumulated rows out as a tsv
# (the output file name is arbitrary)
with open('transactions.tsv', 'w', newline='') as out:
    writer = csv.writer(out, delimiter='\t')
    writer.writerow(column_list)
    writer.writerows(filelist)

ouonomos