0

I have this specific flat input HTML structure:

<!DOCTYPE html>
<html>
  <head>
    <title>Article <b>bold</b> title</title>
  </head>
  <body>
    <article>
      <h1 class="h-title"><span class="span-title">1 </span> Title 1 with some <sup>sup</sup> elements.</h1>
      <p>Some <b>bold</b> text for 1.</p>
      <p>Some more <b>bold</b> text for 1.</p>
      <h1 class="h-title"><span class="span-title">2 </span> Title 2 with some <sup>sup</sup> elements.</h1>
      <ul>
        <li>The first list item.</li>
        <li>The second list item with <i>italic</i> text.</li>
      </ul>
      <p>Some <b>bold</b> text for 2.</p>
      <h2 class="h-title"><span class="span-title">2.1</span> Title 2.1 with some <sup>sup</sup> elements.</h2>
      <p>Some <b>bold</b> text for 2.1.</p>
      <h2 class="h-title"><span class="span-title">2.2</span> Title 2.2 with some <sup>sup</sup> elements.</h2>
      <p>Some <b>bold</b> text for 2.2.</p>
      <h3 class="h-title"><span class="span-title">2.2.1</span> Title 2.2.1 with some <sup>sup</sup> elements.</h3>
      <p>Some <b>bold</b> text for 2.2.1.</p>
      <h3 class="h-title"><span class="span-title">2.2.2</span> Title 2.2.2 with some <sup>sup</sup> elements.</h3>
      <p>Some <b>bold</b> text for 2.2.2.</p>
      <h2 class="h-title"><span class="span-title">2.3</span> Title 2.3 with some <sup>sup</sup> elements.</h2>
      <p>Some <b>bold</b> text for 2.3.</p>
      <h1 class="h-title"><span class="span-title">3</span> Title 3 with some <sup>sup</sup> elements.</h1>
      <p>Some <b>bold</b> text for 3.</p>
    </article>
  </body>
</html>

I would need to create a nested output XML structure as below:

<?xml version="1.0" encoding="UTF-8"?>
<xml>
  <front type="head">
    <title>Article <b>bold</b> title</title>
  </front>
  <body>
    <sec id="s1" sec-type="Title 1 with some sup elements.">
      <label>1</label>
      <title>Title 1 with some <sup>sup</sup> elements.</title>
      <p>Some <b>bold</b> text for 1.</p>
      <p>Some more <b>bold</b> text for 1.</p>
    </sec>
    <sec id="s2" sec-type="Title 2 with some sup elements.">
      <label>2</label>
      <title>Title 2 with some <sup>sup</sup> elements.</title>
      <list list-type="bullet">
        <list-item>The first list item.</list-item>
        <list-item>The second list item with <i>italic</i> text.</list-item>
      </list>
      <p>Some <b>bold</b> text for 2.</p>
      <sec id="s2.1" sec-type="Title 2.1 with some sup elements.">
        <label>2.1</label>
        <title>Title 2.1 with some <sup>sup</sup> elements.</title>
        <p>Some <b>bold</b> text for 2.1.</p>
      </sec>
      <sec id="s2.2" sec-type="Title 2.2 with some sup elements.">
        <label>2.2</label>
        <title>Title 2.2 with some <sup>sup</sup> elements.</title>
        <p>Some <b>bold</b> text for 2.2.</p>
        <sec id="s2.2.1" sec-type="Title 2.2.1 with some sup elements.">
          <label>2.2.1</label>
          <title>Title 2.2.1 with some <sup>sup</sup> elements.</title>
          <p>Some <b>bold</b> text for 2.2.1.</p>
        </sec>
        <sec id="s2.2.2" sec-type="Title 2.2.2 with some sup elements.">
          <label>2.2.2</label>
          <title>Title 2.2.2 with some <sup>sup</sup> elements.</title>
          <p>Some <b>bold</b> text for 2.2.2.</p>
        </sec>
      </sec>
      <sec id="s2.3" sec-type="Title 2.3 with some sup elements.">
        <label>2.3</label>
        <title>Title 2.3 with some <sup>sup</sup> elements.</title>
        <p>Some <b>bold</b> text for 2.3.</p>
      </sec> 
    </sec>
    <sec id="s3" sec-type="Title 3 with some sup elements.">
      <label>3</label>
      <title>Title 3 with some <sup>sup</sup> elements.</title>
      <p>Some <b>bold</b> text for 3.</p>
    </sec>
  </body>
</xml>

So far, I have produced this XSLT transformation below (h1-h6 section needs to be improved I believe):

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  xmlns:xs="http://www.w3.org/2001/XMLSchema"
  exclude-result-prefixes="xs"
  version="2.0">
  
  <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
  
  <!-- all: identity transform. Attributes and nodes that no more specific
       template claims are copied to the output unchanged, and processing
       continues with their own attributes and children. -->
  <xsl:template match="@* | node()">
    <xsl:copy>
      <xsl:apply-templates select="@*"/>
      <xsl:apply-templates select="node()"/>
    </xsl:copy>
  </xsl:template>
  
  <!-- html / xml: the HTML root element is replaced by an element named
       "xml"; its attributes and children are handed to the other templates. -->
  <xsl:template match="html">
    <xsl:element name="xml">
      <xsl:apply-templates select="@* | node()"/>
    </xsl:element>
  </xsl:template>
  
  <!-- head / front: the HTML head becomes <front type="head">; whatever it
       contains is processed by the other templates. -->
  <xsl:template match="head">
    <front>
      <xsl:attribute name="type" select="'head'"/>
      <xsl:apply-templates select="@* | node()"/>
    </front>
  </xsl:template>
  
  <!-- article / : the article wrapper itself produces no element; only its
       content is processed, so that content ends up directly inside the
       element created for the article's parent.
       NOTE(review): attributes of article (none in the sample input) are
       selected as well and would be copied onto that parent element; confirm
       this is intended. -->
  <xsl:template match="article">
    <xsl:apply-templates select="@* | node()"/>
  </xsl:template>
  
  <!-- h1-h6 / sec: converts a heading of class "h-title" into a <sec> that
       holds a <label> (text of the span child) and a <title> (all other
       heading content). Only h1 is matched here, and the nodes that follow
       the heading are not moved into the <sec> by this template. -->
  <xsl:template match="h1[@class='h-title']">
    <!-- Section number as written in the span, whitespace-normalised. -->
    <xsl:variable name="label" select="normalize-space(span)"/>
    <!-- Everything after the first space of the heading's string value. -->
    <xsl:variable name="headingText" select="substring-after(., ' ')"/>
    <sec id="{normalize-space(concat('s', $label))}" sec-type="{$headingText}">
      <label>
        <xsl:value-of select="$label"/>
      </label>
      <title>
        <xsl:apply-templates select="node() except span"/>
      </title>
    </sec>
  </xsl:template>
  
  <!-- ul / list: an unordered list becomes <list list-type="bullet">; its
       items are converted by the li template. -->
  <xsl:template match="ul">
    <list>
      <xsl:attribute name="list-type" select="'bullet'"/>
      <xsl:apply-templates select="@* | node()"/>
    </list>
  </xsl:template>
  
  <!-- li / list-item: each list item is renamed to <list-item>; inline
       markup inside it is processed by the other templates. -->
  <xsl:template match="li">
    <xsl:element name="list-item">
      <xsl:apply-templates select="@* | node()"/>
    </xsl:element>
  </xsl:template>
  
</xsl:stylesheet>

Short description:

I have this flat HTML structure, which needs to be transformed into a nested XML structure. The original HTML may use h1 to h6 headings, and they should be transformed into correspondingly nested XML sections in the output. Each heading (h1...h6) has its own class (h1-title...h6-title). The HTML is always "well-structured", meaning that a heading level is never skipped on the way down: an h1 section can only contain h2 subsections, an h2 section only h3 subsections, etc. A malformed sequence (e.g. h1->h3->h2) will never occur.

I have two issues:

  1. I believe the transformation needs to be done with recursion, but I am unable to figure out how to do that in XSLT. I managed to create the right XML output elements and re-tag everything accordingly, but I'm unable to produce the nested structure.

  2. The second (small) issue is that I don't know how to strip leading/trailing spaces from the content of an XML output tag while still using "node() except span". Calling normalize-space() in this case returns an error.

I will be eternally grateful (and I mean it!) to someone who can solve this recursive mystery above for me.

Futurion
  • 23
  • 6
  • See answers like on similar questions https://stackoverflow.com/questions/31983515/sectioning-different-heading-levels/31985976?r=SearchResults#31985976 – Martin Honnen Sep 23 '20 at 12:36

1 Answers1

0

Three days ago I touched XSLT code for the first time. Today I'm posting my first "achievement", which is based on Martin Honnen's golden function (html(h)->xml(sec)). I believe the code is ugly, and I don't know whether it follows all the standards it should, but the end result is correct, so I'll post it as an answer to my own question for now. If any anomalies/issues are still present, I'll be glad to fix them if someone points them out in a comment.

It looks like this:

<?xml version="1.0" encoding="UTF-8"?>
<xsl:transform
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  xmlns:fn="http://www.w3.org/2005/xpath-functions"
  xmlns:xs="http://www.w3.org/2001/XMLSchema"
  xmlns:mf="http://example.com/mf"
  exclude-result-prefixes="fn xs mf"
  version="2.0">
  
  <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
  
  <!-- all: identity transform. Attributes and nodes that no more specific
       template claims are copied to the output unchanged, and processing
       continues with their own attributes and children. -->
  <xsl:template match="@* | node()">
    <xsl:copy>
      <xsl:apply-templates select="@*"/>
      <xsl:apply-templates select="node()"/>
    </xsl:copy>
  </xsl:template>
  
  <!-- html / xml: the HTML root element is replaced by an element named
       "xml"; its attributes and children are handed to the other templates. -->
  <xsl:template match="html">
    <xsl:element name="xml">
      <xsl:apply-templates select="@* | node()"/>
    </xsl:element>
  </xsl:template>
  
  <!-- head / front: the HTML head becomes <front type="head">; whatever it
       contains is processed by the other templates. -->
  <xsl:template match="head">
    <front>
      <xsl:attribute name="type" select="'head'"/>
      <xsl:apply-templates select="@* | node()"/>
    </front>
  </xsl:template>
  
  <!-- Flat html (h1-h6) to nested xml (sec) transformation.

       mf:group($nodes, $level) splits $nodes into groups that each start at
       a heading of the given level (h1 for level 1, h2 for level 2, ...).
         * A group that starts with such a heading becomes a <sec>: the
           heading itself is handed to the matching template, and the rest of
           the group is grouped again one level deeper.
         * A group that does not start with a heading of this level (content
           before the first one, or a sequence that has none at all) is
           retried one level deeper until level 6 has been tried; after that
           its nodes are simply processed by the other templates.

       The heading test compares the local name for equality with
       'h' followed by $level, so only that exact element starts a section.
       A prefix test would also accept any other element whose name merely
       begins with that string.

       Parameters:
         nodes  the sibling nodes to arrange into sections
         level  the heading level to group on (the article template passes 1)
       Returns the resulting sequence of <sec> elements and processed nodes. -->
  <xsl:function name="mf:group" as="node()*">
    <xsl:param name="nodes" as="node()*"/>
    <xsl:param name="level" as="xs:integer"/>
    <!-- Element name of a heading at this level; computed once per call. -->
    <xsl:variable name="headingName" select="concat('h', $level)"/>
    <xsl:for-each-group select="$nodes" group-starting-with="*[local-name() eq $headingName]">
      <xsl:choose>
        <!-- The group opens with a heading of this level: wrap it in <sec>. -->
        <xsl:when test="self::*[local-name() eq $headingName]">
          <sec>
            <xsl:apply-templates select="."/>
            <xsl:sequence select="mf:group(current-group() except ., $level + 1)"/>
          </sec>
        </xsl:when>
        <!-- No heading of this level here: look for deeper headings. -->
        <xsl:when test="$level lt 6">
          <xsl:sequence select="mf:group(current-group(), $level + 1)"/>
        </xsl:when>
        <!-- Past h6 there is nothing left to nest: emit the content. -->
        <xsl:otherwise>
          <xsl:apply-templates select="current-group()"/>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:for-each-group>
  </xsl:function>
  
  <!-- article / : the article wrapper itself produces no element; its child
       nodes are passed to mf:group with level 1, which arranges them into
       nested sections. -->
  <xsl:template match="article">
    <xsl:variable name="content" select="child::node()"/>
    <xsl:sequence select="mf:group($content, 1)"/>
  </xsl:template>
  
  <!-- h1-h6 / sec: emits the parts of a section that come from its heading.

       mf:group applies this template from inside the <sec> element it
       creates, so the two attributes written first (id, sec-type) end up on
       that <sec>, followed by the <label> and <title> children.

       * The match pattern uses self:: tests because a parenthesised union
         such as (h1|h2)[...] is not part of the XSLT 2.0 pattern grammar
         that this stylesheet declares with version="2.0".
       * The label is read from the first span child only, so a heading with
         further span children does not pass a multi-item sequence to
         normalize-space().
       * sec-type is the whitespace-normalised text of everything except the
         label span, instead of "whatever follows the first space", which
         loses a word when the label is missing or itself contains a space.
       * Leading whitespace of the first title text and trailing whitespace
         of the last title text are removed; inline markup such as sup is
         still processed by the other templates. -->
  <xsl:template match="*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6][@class = 'h-title']">
    <xsl:variable name="labelSpan" select="span[1]"/>
    <xsl:variable name="titleNodes" select="node() except $labelSpan"/>
    <xsl:variable name="secId" select="normalize-space($labelSpan)"/>
    <xsl:variable name="secType" select="normalize-space(string-join($titleNodes/descendant-or-self::text(), ''))"/>
    <xsl:attribute name="id" select="concat('s', $secId)"/>
    <xsl:attribute name="sec-type" select="$secType"/>
    <label>
      <xsl:value-of select="$secId"/>
    </label>
    <title>
      <xsl:for-each select="$titleNodes">
        <xsl:choose>
          <xsl:when test="self::text()">
            <!-- A text node is "first"/"last" when no element or
                 non-blank node of the title precedes/follows it. -->
            <xsl:variable name="isFirst" select="empty(preceding-sibling::node()[not(. is $labelSpan)][self::* or normalize-space(.)])"/>
            <xsl:variable name="isLast" select="empty(following-sibling::node()[not(. is $labelSpan)][self::* or normalize-space(.)])"/>
            <xsl:variable name="leftTrimmed" select="if ($isFirst) then replace(., '^\s+', '') else string(.)"/>
            <xsl:value-of select="if ($isLast) then replace($leftTrimmed, '\s+$', '') else $leftTrimmed"/>
          </xsl:when>
          <xsl:otherwise>
            <xsl:apply-templates select="."/>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:for-each>
    </title>
  </xsl:template>
  
  <!-- ul / list: an unordered list becomes <list list-type="bullet">; its
       items are converted by the li template. -->
  <xsl:template match="ul">
    <list>
      <xsl:attribute name="list-type" select="'bullet'"/>
      <xsl:apply-templates select="@* | node()"/>
    </list>
  </xsl:template>
  
  <!-- li / list-item: each list item is renamed to <list-item>; inline
       markup inside it is processed by the other templates. -->
  <xsl:template match="li">
    <xsl:element name="list-item">
      <xsl:apply-templates select="@* | node()"/>
    </xsl:element>
  </xsl:template>
  
</xsl:transform>
Futurion
  • 23
  • 6