0

Given a HTML as string like this.

<p><strong>This is a text Message.</strong></p>
<ul>
    <li>UL 1</li>
    <li><strong>UL&nbsp;</strong>2</li>
    <li><em>UL 3</em></li>
</ul>
<ol>
    <li style="font-weight: bold;"><strong>First statement</strong></li>
    <li><strong>Second&nbsp;</strong>Statement</li>
    <li>Third <strong>Statement</strong></li>
</ol>
<p>This is another <em>text </em>message.</p>

I want to format this into an excel textbox. It will look like this. enter image description here

I tried writing some basic code to extract the right values. It didn't work as expected.

What the code does is the following.

  1. Get all the children elements from the HTML.
  2. Process them in order and extract the text from it.

Problem comes due to the nested tags strong within p etc....

public void CreateHtmlToRichText() {
//        This is a text Message.
//
//        UL 1
//        UL 2
//        UL 3
//        First statement
//        Second Statement
//        Third Statement
//
//        This is another text message.

        String htmlString = "<p><strong>This is a text Message.</strong></p>\n" +
                "<ul>\n" +
                "    <li>UL 1</li>\n" +
                "    <li><strong>UL&nbsp;</strong>2</li>\n" +
                "    <li><em>UL 3</em></li>\n" +
                "</ul>\n" +
                "<ol>\n" +
                "    <li style=\"font-weight: bold;\"><strong>First statement</strong></li>\n" +
                "    <li><strong>Second&nbsp;</strong>Statement</li>\n" +
                "    <li>Third <strong>Statement</strong></li>\n" +
                "</ol>\n" +
                "<p>This is another <em>text </em>message.</p>";
        System.out.println(htmlString);
        Document document = Jsoup.parse(htmlString);
        Elements elements = document.body().children().select("*");
        System.out.println("****************************");
        System.out.println(elements.size());
        Map<Integer, String> paragraphMap = new HashMap<>();
        Set<String> newLineSet = new HashSet<>();
        newLineSet.add("p");
        newLineSet.add("ol");
        newLineSet.add("ul");
        newLineSet.add("li");
        newLineSet.add("br");

        int lineNumber = 0;
        for (Element element : elements) {
            String ownText = element.ownText();
            String tagName = element.tagName();

            System.out.println("added " + lineNumber + ", " + ownText);
            if (newLineSet.contains(tagName)) {
                lineNumber++;
            }
            paragraphMap.put(lineNumber, paragraphMap.getOrDefault(lineNumber, "") + " " + ownText);

            System.out.println("*********************");
            System.out.println(element);
            System.out.println("Tag : " + tagName);
            System.out.println("Own Text : " + ownText);
            System.out.println("*********************");
        }

        System.out.println(paragraphMap);

        for (int line = 0; line <= lineNumber; line++) {
            if (paragraphMap.containsKey(line)) {
                System.out.println(paragraphMap.get(line).strip());
            }
        }
    }

Outputs:

This is a text Message.

UL 1
2 UL
UL 3

First statement
Statement Second
Third Statement
This is another message. text
Sam Si
  • 163
  • 13

0 Answers0