0

I am using jsoup for HTML table parsing. Below is the scenario where I have to identify the correct segment. The process for identifying the correct segment is:

Wherever I find the keyword ABC, I have to iterate forward until I reach a <tr> HTML tag (to identify the table), then check whether the first row of that table contains all 4 keywords: ForVote, AgainstVote, Absent, NoVotes. If it does not, I move on to the next occurrence of the keyword ABC and repeat the same process. Once I find a table whose first row matches all 4 vote keywords, I can extract the numbers from that table.

The problem I am stuck on is this: if there is just one occurrence of the keyword ABC, I am able to parse the table. But when there is more than one occurrence of ABC, I am not able to, because the wrong segment gets picked for parsing.

My Sample HTML Code to be parsed is:

<!DOCTYPE html>
<html>
    <head>
        <meta charset="ISO-8859-1">
            <title>Correct segment to be identified for parsing table </title>
        </head>
        <body>
            <div>ABC Keyword</div>
            <!--First Occurrence of Keyword(Not a correct segment as the table below doesn't have the correct headers)-->
            <div> asd xyz asdf</div>
        </br>
        <table border="1px">
            <tbody>
                <tr>
                    <td>For Vote</td>
                    <td>Against Vote</td>
                    <td>Some Header1</td>
                    <td>Some Header2</td>
                </tr>
                <tr>
                    <td>1</td>
                    <td>1</td>
                    <td>2</td>
                    <td>3</td>
                </tr>
            </tbody>
        </table>
        <div>
            <p>Another 'ABC' is the keyword in the document</p>
            <!--2nd Occurrence, but not correct segment-->
        </div>
        <div> asd xyz jskadl</div>
    </br>
    <div> ABC is keyword  </div>
    <!-- 3rd Occurrence, this is the correct segment below which the required table with keywords ForVote, AgainstVote, Absent, NoVotes are found whose values are to be parsed-->
</br>undefined</br>undefined<div>
<table border="1px">
    <tbody>
        <tr>
            <td>ForVote</td>
            <td>AgainstVote</td>
            <td>Absent</td>
            <td>NoVotes</td>
        </tr>
        <tr>
            <td>10</td>
            <td>5</td>
            <td>1</td>
            <td>0</td>
        </tr>
    </tbody>
</table>
</div>
<p>Doc ends</p>
</body>
</html>

Java Code

My logic is to iterate until I find ABC, find the element which encloses ABC, and add class=tagid to it so that it can be selected with select(div.tagid). Then I walk forward until I find the <tr> tag and check whether the table is in the expected format (i.e. isVertical=0 in the code). Finally, I check whether all four keywords are present in the first row; if yes, I parse the numerical values. This doesn't work in the case of multiple occurrences of "ABC" :-(

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/** Regex for the keyword that marks the start of a candidate segment. */
static final String regexPattern1 = "ABC";
/**
 * Regex for the first header keyword of the target table.
 *
 * <p>The trailing "s" is optional: the sample document spells the header cell
 * "ForVote", while the code comments call it "ForVotes". The previous literal
 * "ForVotes" could never match the sample header row, so the header check
 * always failed for the table that is supposed to be parsed.
 */
static final String tableregexPattern1 = "ForVotes?";
/** Compiled once here and reused for every table row that is tested. */
static final Pattern tPat1 = Pattern.compile(tableregexPattern1);

//a function for finding occurrence of ABC  
public static Element htmlIterator(String HTMLTags, String regexPattern) throws IOException {       
pattern = Pattern.compile(regexPattern1, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);// compiles the matching regex
    for (String tag : HTMLTags) {
        Elements tagData = doc.select("div");
        for (Element element1 : tagData) {
            if (element1.select("div").text().trim().equals("")) {
                continue;
            } else {

                final String dataParsedInTag = element1.select("div").text().trim();
                final String dataParsedInTagClean = dataParsedInTag.replace(",", "");
                final Matcher matcher = pattern.matcher(dataParsedInTagClean);
                b1 = matcher.find();
                if (b1) {
                    System.out.println(b1 + " matched");
                    return element1;
                        }
                   }
            }
    }
    public static void main{  

doc=Jsoup.parseHTML(input);     //input is above given HTML snippet
element1 = htmlIterator(div, regexPattern1);// returns the element which has "ABC"
    Elements ele = element1.getElementsMatchingText(pattern);
    if(ele != null) {
        Elements manipulatedElement = ele.addClass(tagid);//attach class= tagid to the identified div
        //iterate till I get <tr>
        while (true) {
                resultTableHTML = doc.selectFirst(div+"."+tagid).nextElementSibling();
                resultTableInChar = doc.selectFirst(div+"."+tagid).nextElementSibling().toString();
                nextResultTable = doc.selectFirst(div+"."+tagid).nextElementSibling();
                // System.out.println(resultTableInChar);
                while (!resultTableInChar.contains("tr")) {
                    resultTableInChar = nextResultTable.nextElementSibling().toString();
                    nextResultTable = nextResultTable.nextElementSibling();// for continuous iteration
                    System.out.println("-->Iterating" + nextResultTable);
                }
                break;
        
    }
    //check if the table is having the keyword ForVotes and is int the expected tabular format that is an isVertical=0
    Elements rows = nextResultTable.select("tr");// just select the rows and check if its empty or not
            for (Element rowElement : rows) {
                Matcher mat1 = tPat1.matcher(rowElement.text());
                boolean isTablewithFirstHeaderKeyword =  mat1.find();
                if (!(rowElement.text().isEmpty()) && (isTablewithFirstHeaderKeyword)  ) {
                    String tmpLines[] = rowElement.text().trim().replaceAll(",", "").split(" ");
                    String tmpRowElement = rowElement.text().trim().replaceAll(",", "");
                    Matcher mat5 = tPat5.matcher(tmpRowElement);
                    boolean typeVerticaldetected = mat5.find();//for detecting the numerical values
                    if (typeVerticaldetected) {
                        isVertical = 1;
                        break;
                    } else {
                        isVertical = 0;
                        break;
                    }

                }

            }
            if (isVertical == 0) {
                System.out.println("Horizontal Table Identified. Start Parsing.....");
                rows = nextResultTable.select("tr");
                for (Element rowElement : rows) {  

//if row isn't empty then find all 4 keywords
                    if (!rowElement.text().isEmpty()) {
                        Matcher mat1 = tPat1.matcher( rowElement.toString());//tpat1 is regex for ForVotes inside table row, CAN use contains for now
                        Matcher mat2 = tPat2.matcher( rowElement.toString());//tpat2 is regex for AgainstVotes inside table row
                        Matcher mat3 = tPat3.matcher( rowElement.toString());//tpat3 is regex for Absent inside table row
                        Matcher mat4 = tPat4.matcher( rowElement.toString());//tpat4 is regex for NoVotes inside table row
                        boolean hasTableHeaderKeywords = mat1.find() && mat2.find() && mat3.find() && mat4.find();
                        System.out.println(mat1.find()+";"+mat2.find()+";"+mat3.find()+";"+mat4.find()+";");
                        
                        if(hasTableHeaderKeywords) {
                            rowElement = rowElement.nextElementSibling();
                            
                            
                            
                            String tmpLines[] = rowElement.text().trim().replaceAll(",", "").split(" ");
                            
                            Matcher mat5 = tPat5.matcher(tmpLines[0]);//tpat5 is regex for numerical digits inside table 2nd row
                            Matcher mat6 = tPat5.matcher(tmpLines[1]);
                            Matcher mat7 = tPat5.matcher(tmpLines[2]);
                            Matcher mat8 = tPat5.matcher(tmpLines[3]);
                            
                            System.out.println(mat5.matches());
                            System.out.println(mat6.matches());
                            System.out.println(mat7.matches());
                            System.out.println(mat8.matches());
                            
                            
                            if (mat5.matches())
                            {
                            for(int index=0 ; index < tmpLines.length ; index++) {
                                System.out.println("Value at index-> "+index+" is : "+tmpLines[index]);
                            }
                            
                            System.out.println("For : "+ tmpLines[0] + "|" +"Against : "+ tmpLines[1] + "|" + "Abstain : "+tmpLines[2] + "|" +"Broker Non-Votes : "+ tmpLines[3]);
                            break;
                            }
                            else {
                                
                                System.out.println("Numerical Values weren't found in expected range for"+tmpLines);
                            }
                        }
Jason Aller
  • 3,541
  • 28
  • 38
  • 38
  • I have tried to post in as much detail as possible, as it's my first post on Stack Overflow. I have been stuck on this, finding no other solution for overcoming the issue of multiple occurrences of the keyword. – sumit kumar sahoo Feb 25 '19 at 13:26
  • Just to clarify, I cannot remove any HTML code while iterating, in case the first occurrence of the specified keyword doesn't give me the mentioned table while finding `<tr>`. – sumit kumar sahoo Feb 25 '19 at 15:50
  • If anyone has inputs to simplify this, kindly suggest!! – sumit kumar sahoo Mar 15 '19 at 10:00

0 Answers0