0

Given this HTML5 input, which is to be transformed:

<!DOCTYPE html>
<html>
    <head>
    </head>
    <body>
        <table border="1">
            <caption>Complex Table</caption>
            <!-- Fixture for the rowspan problem: cells from earlier rows can span down
                 into a row, so the first <td> of a row is not always in the same column. -->
            <tbody>
                <tr>
                    <th>Title</th>
                    <th>Volume</th>
                    <th>Chapter</th>
                    <th>Stds.</th>
                    <th>Dept.</th>
                </tr>
                <tr>
                    <!-- The Title cell spans all six data rows, so the rows below carry no Title <td>. -->
                    <td rowspan="6">STEM</td>
                    <td rowspan="1">1</td>
                    <!-- Chapter and Dept. each span two rows, so the next row holds only Volume and Stds. cells. -->
                    <td rowspan="2">1</td>
                    <td>1 to 10</td>
                    <td rowspan="2">Biology</td>
                </tr>
                <tr>
                    <td rowspan="1">2</td>
                    <td>20 to 30</td>
                </tr>
                <tr>
                    <td rowspan="1">3</td>
                    <td rowspan="1">2</td>
                    <td>40 to 60</td>
                    <td rowspan="1">Chemistry</td>
                </tr>
                <tr>
                    <td>4</td>
                    <td>3</td>
                    <td>70 to 80</td>
                    <td>Physics</td>
                </tr>
                <tr>
                    <!-- Volume 5 spans down into the next row; rowspan="4" is larger than the two rows left in this <tbody>. -->
                    <td rowspan="4">5</td>
                    <td rowspan="1">4</td>
                    <td>80 to 120</td>
                    <td rowspan="1">Math</td>
                </tr>
                <tr>
                    <!-- Title and Volume are both covered by rowspans from above, so the first <td> here is Chapter 5, not a volume. -->
                    <td rowspan="1">5</td>
                    <td>120 to 135</td>
                    <td rowspan="1">Geometry</td>
                </tr>
            </tbody>
        </table>
        <table border="1">
            <caption>Simpler Table</caption>
            <!-- Fixture with a single rowspan: only the Title cell spans rows. -->
            <tbody>
                <tr>
                    <th>Title</th>
                    <th>Volume</th>
                    <th>Chapter</th>
                    <th>Stds.</th>
                    <th>Dept.</th>
                </tr>
                <tr>
                    <!-- The Title cell spans all three data rows; in the rows below, the first <td> is the Volume cell. -->
                    <td colspan="1" rowspan="3">Kinesiology</td>
                    <td>1</td>
                    <td>1</td>
                    <td>A to C</td>
                    <td>Strength</td>
                </tr>
                <tr>
                    <td>2</td>
                    <td>2 to 3</td>
                    <td>D to H</td>
                    <td>Agility</td>
                </tr>
                <tr>
                    <td>3</td>
                    <td>4</td>
                    <td>I to X</td>
                    <td>Flexibility</td>
                </tr>
            </tbody>
        </table>
        <table border="1">
            <caption>Simplest Table</caption>
            <!-- Fixture with one data row and no rowspans: Title is the first <td>, Volume the second. -->
            <tbody>
                <tr>
                    <th>Title</th>
                    <th>Volume</th>
                    <th>Chapter</th>
                    <th>Stds.</th>
                    <th>Dept.</th>
                </tr>
                <tr>
                    <td>Skills</td>
                    <td>1</td>
                    <td>1</td>
                    <td>A to C</td>
                    <td>Keyboard</td>
                </tr>
            </tbody>
        </table>        
    </body>
</html>

This is the desired output (if you view the rendered HTML, you can see the pattern of data wanted):

<?xml version="1.0" encoding="UTF-8"?>
<!-- Expected result: one <book> per volume of each input table, in table order,
     carrying the table's title and the volume number. -->
<!-- NOTE(review): every stylesheet on this page emits <catalog> as the root element,
     not <production>; confirm which root name is intended. -->
<production>
    <book title="STEM" volume="1"/>
    <book title="STEM" volume="2"/>
    <book title="STEM" volume="3"/>
    <book title="STEM" volume="4"/>
    <book title="STEM" volume="5"/>
    <book title="Kinesiology" volume="1"/>
    <book title="Kinesiology" volume="2"/>
    <book title="Kinesiology" volume="3"/>
    <book title="Skills" volume="1"/>
</production>

The not quite working transform:

<xsl:stylesheet
    version="2.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    exclude-result-prefixes="xs">

    <!-- The asker's attempt: walks the whole document and emits <book title volume>
         elements under a <catalog> root. The selection in the multi-volume template
         is the part that does not yet produce the desired result. -->

    <xsl:output method="xml" encoding="UTF-8" indent="yes" />

    <xsl:template match="/">
        <catalog>
            <xsl:apply-templates/>
        </catalog>
    </xsl:template>

    <!-- Suppress text nodes so that only the explicitly constructed elements reach the output. -->
    <xsl:template match="text()"/>

    <!-- multi-volume edition: applies to every table that the more specific
         single-volume pattern further down does not match -->
    <xsl:template match="table">
        <!-- First <td> of the table in document order, i.e. the Title cell of the first data row. -->
        <xsl:variable name="title" select="descendant::td[1]"/>
        <!-- Bound to the current <table> (despite the name) and not referenced anywhere below. -->
        <xsl:variable name="context-td" select="."/>
        <!-- the following needs work: it takes the first <td> of each row when its text
             ends in a digit. A row whose first <td> is the Title cell is skipped, which
             loses that row's volume; a row whose Volume column is covered by a rowspan
             from above contributes its Chapter cell instead. -->
        <xsl:for-each select="descendant::tr/td[1][matches(.,'\d+$')]">
            <book>
                <xsl:attribute name="title" select="$title"/>
                <xsl:attribute name="volume" select="."/>
            </book>            
        </xsl:for-each>
    </xsl:template>

    <!-- single-volume edition: tables with fewer than three rows (the header row plus
         at most one data row); title is the first <td>, volume the second <td> of row two -->
    <xsl:template match="table[count(descendant::tr) &lt; 3]">
        <book>
            <xsl:attribute name="title" select="descendant::td[1]"/>
            <xsl:attribute name="volume" select="descendant::tr[2]/td[2]"/>
        </book>            
    </xsl:template>        
</xsl:stylesheet>

The XPath in the for-each needs work. I've tried various axes but haven't found one that works across all use cases.

Paulb
  • 1,471
  • 2
  • 16
  • 39

3 Answers

2

Couldn't this be simply:

XSLT 2.0

<xsl:stylesheet version="2.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Position-based first attempt: treats the second <td> of the first data row and
     the first <td> of every later row as a volume cell. It takes no account of
     rowspans, so a row whose Volume column is covered from above contributes its
     Chapter cell as if it were a volume. -->
<xsl:output method="xml" encoding="UTF-8" indent="yes" />

<!-- Wrap the books of all tables in a single <catalog> root. -->
<xsl:template match="/">
    <catalog>
        <xsl:apply-templates select="html/body/table"/>
    </catalog>
</xsl:template>

<!-- One <book> per candidate volume cell of the table. -->
<xsl:template match="table">
    <!-- Title cell: first <td> of the first data row (row 1 is the header row). -->
    <xsl:variable name="book-title" select="tbody/tr[2]/td[1]"/>
    <xsl:for-each select="tbody/(tr[2]/td[2] | tr[position() gt 2]/td[1])">
        <book title="{$book-title}" volume="{.}"/>
    </xsl:for-each>
</xsl:template>

</xsl:stylesheet>

Oops, I see there is a problem with volume 5 of STEM being listed twice - hold on...


No, I don't see a simple solution to this. I suspect you'd have to drill down into the table's structure, taking preceding rowspans into consideration - somewhat similar to: Please suggest for XSLT code for Table rowspan and colspan issues


Edit:

Ok, I believe this should work:

XSLT 2.0

<xsl:stylesheet version="2.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Rowspan-aware walk down the Volume column. Starting from the Volume cell of the
     first data row, each volume cell emits a <book> and then hands over to the row
     that lies its own rowspan further down, where the Volume column is free again.
     Relies on the Title cell covering the first column of every data row, as in the
     sample tables, so that the first <td> of such a row is its Volume cell. -->
<xsl:output method="xml" encoding="UTF-8" indent="yes" />

<!-- Wrap the books of all tables in a single <catalog> root. -->
<xsl:template match="/">
    <catalog>
        <xsl:apply-templates select="html/body/table"/>
    </catalog>
</xsl:template>

<!-- Start the walk at the first data row; the title travels along as a tunnel
     parameter so it does not have to be re-passed on every step. -->
<xsl:template match="table">
    <xsl:apply-templates select="tbody/tr[2]/td[2]">
        <xsl:with-param name="title" select="tbody/tr[2]/td[1]" tunnel="yes"/>
    </xsl:apply-templates>
</xsl:template>

<!-- A volume cell: emit its <book>, then continue with the next volume cell. -->
<xsl:template match="td">
    <xsl:param name="title" tunnel="yes"/>
    <!-- Number of rows this cell occupies; a missing rowspan attribute counts as 1. -->
    <xsl:variable name="span" select="(@rowspan, 1)[1]"/>
    <book title="{$title}" volume="{.}"/>
    <!-- If no row exists that far down (the span reaches the end of the table),
         nothing is selected and the walk ends. -->
    <xsl:apply-templates select="parent::tr/following-sibling::tr[position() = number($span)]/td[1]"/>
</xsl:template>

</xsl:stylesheet>

Tested against a modified input (shown in the original post as an image, which is not reproduced here):

http://xsltransform.net/94hvTz1/2

Community
  • 1
  • 1
michael.hor257k
  • 113,275
  • 6
  • 33
  • 51
  • Thanks. I ran it on several production files--success. I learned some new things. tunnel="yes" is new to me, as was xsltransform.net – Paulb Dec 05 '14 at 07:59
1

I tried it with grouping:

<xsl:stylesheet
    version="2.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    exclude-result-prefixes="xs">

    <!-- Grouping-based variant. Candidate volume cells are de-duplicated by their
         text value, not by their column, so a Chapter cell that lands in first
         position only disappears when its value equals an existing volume number. -->

    <xsl:output method="xml" encoding="UTF-8" indent="yes" />

    <!-- Wrap the books of all tables in a single <catalog> root. -->
    <xsl:template match="/">
        <catalog>
            <xsl:apply-templates select="descendant::table"/>
        </catalog>
    </xsl:template>

    <!-- One <book> per distinct candidate volume value of the table. -->
    <xsl:template match="table">
        <!-- Every selected cell gets the same set of grouping keys (all candidate
             volume cells of its <tbody>), so each group starts with the first
             selected cell, which is the Title cell of the first data row. -->
        <xsl:for-each-group
            select="tbody/tr[position() > 1]/td[1]"
            group-by="../../(tr[2]/td[2] | tr[position() > 2]/td[1])">
            <book>
                <xsl:attribute name="title" select="."/>
                <xsl:attribute name="volume" select="current-grouping-key()"/>
            </book>
        </xsl:for-each-group>
    </xsl:template>

</xsl:stylesheet>
Martin Honnen
  • 160,499
  • 6
  • 90
  • 110
  • That's pretty impressive. I'll have to study that group-by statement so I can understand what it's doing. – Paulb Dec 04 '14 at 10:45
  • I suspect you're playing off a coincidence. If you give your volumes and chapters distinct names... – michael.hor257k Dec 04 '14 at 10:47
  • @michael.hor257k, I think you are right, the result for the sample is correct, but the approach does not work in general, we need some way to eliminate those `td[1]` elements that are in row where a previous `td` with some `rowspan` covers that column. – Martin Honnen Dec 04 '14 at 10:51
  • Mike... the volumes will ALWAYS be digits only, if that helps. That's why I considered an xpath with matches(.,'\d+$') in it. – Paulb Dec 04 '14 at 10:57
  • @Paulb The problem is that so are (or can be) the chapters. – michael.hor257k Dec 04 '14 at 11:49
0

Would this help, by any chance (with small changes made to michael.hor257k's answer):

<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Position-based selection of candidate volume cells, with a value-based filter:
     a candidate is dropped when a later row of the same table starts with a <td>
     of equal text. The filter compares values, not columns, so a Chapter cell in
     first position whose number differs from every volume is still emitted. -->
<xsl:output method="xml" encoding="UTF-8" indent="yes" />
<!-- Wrap the books of all tables in a single <catalog> root. -->
<xsl:template match="/">
    <catalog>
        <xsl:apply-templates select="html/body/table"/>
    </catalog>
</xsl:template>
<!-- One <book> per candidate cell that has no equal-valued first cell further down. -->
<xsl:template match="table">
    <xsl:variable name="title" select="tbody/tr[2]/td[1]"/>
    <!-- Identity of this table; the following:: axis below also reaches rows of later tables. -->
    <xsl:variable name="table-id" select="generate-id()"/>
    <xsl:for-each select="tbody/tr[2]/td[2] | tbody/tr[position() > 2]/td[1]">
        <xsl:variable name="candidate" select="."/>
        <!-- First cells of later rows in this same table that carry the same text. -->
        <xsl:variable name="later-twins"
            select="following::tr/td[1][generate-id(../../..) = $table-id][. = $candidate]"/>
        <xsl:if test="empty($later-twins)">
            <book title="{$title}" volume="{.}"/>
        </xsl:if>
    </xsl:for-each>
</xsl:template>
</xsl:stylesheet>
Lingamurthy CS
  • 5,412
  • 2
  • 13
  • 21