Comparing 2 node sets based on attribute sequence

Question

I'm trying to build up a kind of library XML, comparing various nodes and combining them for later reuse. The logic should be fairly straightforward: if the tag_XX attribute value sequence of a given language is equal to the tag_YY attribute value sequence of another language, the nodes can be combined. See the XML example below.

<Book>
<Section>
    <!-- English master - its tag_GB value sequence is the reference the other locales are compared against -->
    <GB>
        <Para tag_GB="L1">
            <Content_GB>string_1</Content_GB>
        </Para>
        <Para tag_GB="Illanc">
            <Content_GB>string_2</Content_GB>
        </Para>
        <Para tag_GB="|PLB">
            <Content_GB>string_3</Content_GB>
        </Para>
        <Para tag_GB="L1">
            <Content_GB>string_4</Content_GB>
        </Para>
        <Para tag_GB="Sub">
            <Content_GB>string_5</Content_GB>
        </Para>
        <Para tag_GB="L3">
            <Content_GB>string_6</Content_GB>
        </Para>
        <Para tag_GB="Subbull">
            <Content_GB>string_7</Content_GB>
        </Para>
    </GB>
    <!-- German translations - OK: the tag_DE values follow exactly the same sequence as tag_GB -->
    <DE>
        <Para tag_DE="L1">
            <Content_DE>German_translation of_string_1</Content_DE>
        </Para>
        <Para tag_DE="Illanc">
            <Content_DE>German_translation of_string_2</Content_DE>
        </Para>
        <Para tag_DE="|PLB">
            <Content_DE>German_translation of_string_3</Content_DE>
        </Para>
        <Para tag_DE="L1">
            <Content_DE>German_translation of_string_4</Content_DE>
        </Para>
        <Para tag_DE="Sub">
            <Content_DE>German_translation of_string_5</Content_DE>
        </Para>
        <Para tag_DE="L3">
            <Content_DE>German_translation of_string_6</Content_DE>
        </Para>
        <Para tag_DE="Subbull">
            <Content_DE>German_translation of_string_7</Content_DE>
        </Para>
    </DE>
    <!-- Danish translations - not OK: the tag_DK sequence differs from tag_GB (extra L1_sub, entries in a different order) -->
    <DK>
        <Para tag_DK="L1">
            <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
        </Para>
        <Para tag_DK="L1_sub">
            <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
        </Para>
        <Para tag_DK="Illanc">
            <Content_DK>Danish_translation_of_string_2</Content_DK>
        </Para>
        <Para tag_DK="L1">
            <Content_DK>Danish_translation_of_string_4</Content_DK>
        </Para>
        <Para tag_DK="|PLB">
            <Content_DK>Danish_translation_of_string_3</Content_DK>
        </Para>
        <Para tag_DK="L3">
            <Content_DK>Danish_translation_of_string_6</Content_DK>
        </Para>
        <Para tag_DK="Sub">
            <Content_DK>Danish_translation_of_string_5</Content_DK>
        </Para>
        <Para tag_DK="Subbull">
            <Content_DK>Danish_translation_of_string_7</Content_DK>
        </Para>
    </DK>
</Section>
</Book>

So

GB tag_GB value sequence = L1 -> Illanc -> ... -> Subbull

DE tag_DE value sequence = L1 -> Illanc -> ... -> Subbull (same as GB, so OK)

DK tag_DK value sequence = L1 -> L1_sub -> Oops, expected Illanc, meaning this sequence is not the same as GB and the locale can be ignored

Since the German and English node sets have the same attribute sequence, I'd like to combine them as follows:

<Book>
<Dictionary>
    <Para tag="L1">
        <Content_GB>string_1</Content_GB>
        <Content_DE>German_translation of_string_1</Content_DE>
    </Para>
    <Para tag="Illanc">
        <Content_GB>string_2</Content_GB>
        <Content_DE>German_translation of_string_2</Content_DE>
    </Para>
    <Para tag="|PLB">
        <Content_GB>string_3</Content_GB>
        <Content_DE>German_translation of_string_3</Content_DE>
    </Para>
    <Para tag="L1">
        <Content_GB>string_4</Content_GB>
        <Content_DE>German_translation of_string_4</Content_DE>
    </Para>
    <Para tag="Sub">
        <Content_GB>string_5</Content_GB>
        <Content_DE>German_translation of_string_5</Content_DE>
    </Para>
    <Para tag="L3">
        <Content_GB>string_6</Content_GB>
        <Content_DE>German_translation of_string_6</Content_DE>
    </Para>
    <Para tag="Subbull">
        <Content_GB>string_7</Content_GB>
        <Content_DE>German_translation of_string_7</Content_DE>
    </Para>
</Dictionary>
</Book>

The stylesheet I use is the following :

<!--
    Asker's stylesheet: copies the document with an identity rule, trims text
    nodes, and replaces each Section by a Dictionary whose Para elements hold
    the English (GB) content plus the content of the other locales.
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Two unnamed xsl:output declarations; the processor merges their attributes into one output definition. -->
<xsl:output method="xml" version="1.0" xmlns="http://www.w3.org/1999/xhtml" encoding="UTF-8" indent="yes"/>
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<!-- Document node: hand all children to the rules below. -->
<xsl:template match="/">
    <xsl:copy>
        <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
</xsl:template>
<!-- Identity rule: shallow-copy any attribute or node and recurse into it. -->
<xsl:template match="@* | node()">
    <xsl:copy>
        <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
</xsl:template>
<!-- Text nodes are emitted with leading/trailing whitespace removed and inner runs collapsed. -->
<!-- NOTE(review): this rule has the same default priority as the node() rule above; an explicit priority attribute would remove the ambiguity. -->
<xsl:template match="text()">
    <xsl:value-of select="normalize-space(.)"/>
</xsl:template>
<xsl:template match="Section">
    <!-- Reference tag list: the first attribute of every GB/Para. -->
    <!-- NOTE(review): attribute()[1] relies on attribute order, which XML does not guarantee when a Para carries several attributes. -->
    <xsl:variable name="Ref_tagList" select="GB/Para/attribute()[1]"/>
    <Dictionary>
        <xsl:for-each select="GB/Para">
            <!-- Index of the current GB Para, used to pick the Para at the same index in each other locale. -->
            <xsl:variable name="pos" select="position()"/>
            <Para tag="{@tag_GB}">
                <!-- Copy English Master -->
                <xsl:apply-templates select="element()[1]"/>
                <!-- The absolute path visits the non-GB children of every Book/Section in the document, not only those of the current Section. -->
                <xsl:for-each select="//Book/Section/element()[not(self::GB)]">
                    <!-- Tag list of the current locale: the first attribute of each of its Para elements. -->
                    <xsl:variable name="Curr_tagList" select="Para/attribute()[1]"/>
                    <!-- NOTE(review): "=" between two sequences is a general comparison: it is true as soon as ANY value in $Ref_tagList equals ANY value in $Curr_tagList. It does not compare the sequences item by item, so a locale sharing a single tag value with GB passes this test. -->
                    <xsl:if test="$Ref_tagList = $Curr_tagList">
                        <!-- Copy the current locale's content if the test above succeeds (intended: only when its tag list equals the reference tag list). -->
                        <xsl:apply-templates select="Para[position()=$pos]/element()[1]"/>
                    </xsl:if>
                </xsl:for-each>
            </Para>
        </xsl:for-each>
    </Dictionary>
</xsl:template>
</xsl:stylesheet>

Apart from probably not being the most efficient way to do this (I'm fairly new to the XSLT game...), it's not working either. The logic I had in mind is to take the attribute sequence of the English master and, if the attribute sequence of any other locale is equal, copy that locale; if not, ignore it. But for some reason node sets that have a different attribute sequence are also happily copied (as seen below). Can someone tell me where my logic conflicts with reality? Thanks in advance!

Current output, including the Danish content that should have been ignored:

<Book>
<Dictionary>
    <Para tag="L1">
        <Content_GB>string_1</Content_GB>
        <Content_DE>German_translation of_string_1</Content_DE>
        <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
    </Para>
    <Para tag="Illanc">
        <Content_GB>string_2</Content_GB>
        <Content_DE>German_translation of_string_2</Content_DE>
        <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
    </Para>
    <Para tag="|PLB">
        <Content_GB>string_3</Content_GB>
        <Content_DE>German_translation of_string_3</Content_DE>
        <Content_DK>Danish_translation_of_string_2</Content_DK>
    </Para>
    <Para tag="L1">
        <Content_GB>string_4</Content_GB>
        <Content_DE>German_translation of_string_4</Content_DE>
        <Content_DK>Danish_translation_of_string_4</Content_DK>
    </Para>
    <Para tag="Sub">
        <Content_GB>string_5</Content_GB>
        <Content_DE>German_translation of_string_5</Content_DE>
        <Content_DK>Danish_translation_of_string_3</Content_DK>
    </Para>
    <Para tag="L3">
        <Content_GB>string_6</Content_GB>
        <Content_DE>German_translation of_string_6</Content_DE>
        <Content_DK>Danish_translation_of_string_6</Content_DK>
    </Para>
    <Para tag="Subbull">
        <Content_GB>string_7</Content_GB>
        <Content_DE>German_translation of_string_7</Content_DE>
        <Content_DK>Danish_translation_of_string_5</Content_DK>
    </Para>
</Dictionary>
</Book>

do you only wish to group the ones in sequence? or group ones that match? — Treemonkey, Jul 14 '11 at 14:49
The full 'section' has to match. In reality there are far more content strings in a group, with quite some variations in the tagging. So let's say the GB section has 50 Para's, the German section should also have 50 Para's, with attributes in exactly the same sequence. — Wokoman, Jul 14 '11 at 15:18
Also, what if there was another sequence of language Para elements that matched DK. Which sequence pattern should be used? Or would you want two Dictionary elements that grouped (GB,DE) and then (DK, XX)? — Mads Hansen, Jul 14 '11 at 15:20
Just for the record, `<Content_GB>`, `<Content_DE>` etc. is really bad XML design. I'd STRONGLY recommend using `<Content xml:lang="en-GB">`, `<Content xml:lang="de">` etc. It'll make working with schemas and XSLT a LOT easier. — Flynn1179, Jul 14 '11 at 15:24
Hi Flynn, Fully aware of that, but I left this out to keep the code a bit condensed. My source data is structured as shown, final data should indeed use xml:lang and do something with the ones that don't match also — Wokoman, Jul 14 '11 at 15:28
@Mads : Indeed, that's the final idea. I'm actually having 32 different language files, but I figured that if I could get the ones that match already combined I should be able to use the same logic recursively for the ones that don't match. If there is a way to have this all in one go I be more than happy to learn about that of course. — Wokoman, Jul 14 '11 at 15:30
+1 for detailed question, effort and _crazy_ input document. — Emiliano Poggi, Jul 14 '11 at 16:16

Emiliano Poggi · Accepted Answer · 2011-07-14T16:29:10.213

This might not be the best solution. I've used the following XSLT 2.0 features:

I compared the sequence of attributes using string-join().
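I've exploited the possibility of storing a temporary tree in a variable (in XSLT 1.0 this would be a result tree fragment, which cannot be navigated with XPath without an extension function).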

There are probably more XSLT 2.0 facilities which can solve your problem, but I think the BIG problem here is your input document.

I'm sorry, I did not have a look at your current transform; I just implemented one from scratch. Hope it helps:

<!--
    Combines the English master (GB) with every following sibling locale whose
    sequence of tag_* attribute values is identical to the GB sequence.
    Locales with a different sequence are dropped from the output.
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output indent="yes"/>
    <xsl:strip-space elements="*"/>

    <!--
        GB is the entry point: it emits the Book/Dictionary wrapper, collects
        the matching locales into a temporary tree and passes that tree to the
        Para rule below.
    -->
    <xsl:template match="GB">
        <!--
            Comparison key of the reference sequence. Every tag value is
            prefixed with its length ("2:L1", "6:Illanc", ...) before joining,
            so two different sequences can never yield the same key, whatever
            characters the tag values contain. Only tag_* attributes take part,
            so other attributes on Para do not disturb the comparison.
        -->
        <xsl:variable name="refKey" select="string-join(
            for $tag in Para/@*[starts-with(local-name(),'tag_')]
            return concat(string-length($tag), ':', $tag), '')"/>
        <Book>
            <Dictionary>

                <!-- Temporary tree: one match element per locale whose key equals the reference key. -->
                <xsl:variable name="matches">
                    <xsl:for-each select="following-sibling::*
                        [string-join(
                            for $tag in Para/@*[starts-with(local-name(),'tag_')]
                            return concat(string-length($tag), ':', $tag), '')
                        = $refKey]">
                        <!-- Keep the Para wrappers so content can be looked up by Para index even when a Para has several children. -->
                        <match><xsl:copy-of select="Para"/></match>
                    </xsl:for-each>
                </xsl:variable>

                <xsl:apply-templates select="Para">
                    <xsl:with-param name="matches" select="$matches"/>
                </xsl:apply-templates>

            </Dictionary>
        </Book>
    </xsl:template>

    <!--
        One output Para per GB Para.
        $matches: temporary tree built by the GB rule (match/Para/...).
    -->
    <xsl:template match="Para[parent::GB]">
        <xsl:param name="matches"/>
        <!-- Index of this Para among the GB Para elements selected by the GB rule. -->
        <xsl:variable name="pos" select="position()"/>
        <Para tag="{@tag_GB}">
            <xsl:copy-of select="Content_GB"/>
            <!-- Content of the Para at the same index in every matching locale. -->
            <xsl:copy-of select="$matches/match/Para[$pos]/*"/>
        </Para>
    </xsl:template>

    <!-- Suppress text reached through the built-in rules (e.g. inside non-matching locales). -->
    <xsl:template match="text()"/>

</xsl:stylesheet>

When applied to the input document provided in the question, the following output is produced:

<Book>
   <Dictionary>
      <Para tag="L1">
         <Content_GB>string_1</Content_GB>
         <Content_DE>German_translation of_string_1</Content_DE>
      </Para>
      <Para tag="Illanc">
         <Content_GB>string_2</Content_GB>
         <Content_DE>German_translation of_string_2</Content_DE>
      </Para>
      <Para tag="|PLB">
         <Content_GB>string_3</Content_GB>
         <Content_DE>German_translation of_string_3</Content_DE>
      </Para>
      <Para tag="L1">
         <Content_GB>string_4</Content_GB>
         <Content_DE>German_translation of_string_4</Content_DE>
      </Para>
      <Para tag="Sub">
         <Content_GB>string_5</Content_GB>
         <Content_DE>German_translation of_string_5</Content_DE>
      </Para>
      <Para tag="L3">
         <Content_GB>string_6</Content_GB>
         <Content_DE>German_translation of_string_6</Content_DE>
      </Para>
      <Para tag="Subbull">
         <Content_GB>string_7</Content_GB>
         <Content_DE>German_translation of_string_7</Content_DE>
      </Para>
   </Dictionary>
</Book>

Thanks Empo, really appreciate the efforts, I used Mads adaption in the end, but the methods are pretty similar. Learned again some valuable stuff today ! — Wokoman, Jul 14 '11 at 18:05

score 1 · Answer 2 · answered Jul 14 '11 at 17:41

This stylesheet makes use of <xsl:for-each-group>

First, groups the elements by their sequence of Para/@* values
Then, for each of those sequences, groups the Para using the number of following sibling elements that have attributes that start with "tag".

I have predicate filters on the matches for @*, to ensure that it is comparing the ones that start with "tag_". That may not be necessary, but would help ensure that it still worked if other attributes were added to the instance XML.

<!--
    Groups the language elements of each Section by their sequence of tag_*
    attribute values and emits one Dictionary per group, with the content of
    all languages of the group merged Para by Para.
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <!-- Single output declaration carrying the settings previously spread over two declarations. -->
    <xsl:output method="xml" version="1.0" encoding="UTF-8"
        omit-xml-declaration="yes" indent="yes"/>

    <!-- Identity rule: copy everything that has no more specific rule. -->
    <xsl:template match="@* | node()">
        <xsl:copy>
            <xsl:apply-templates select="@* | node()"/>
        </xsl:copy>
    </xsl:template>

    <!-- Text nodes are whitespace-normalized; the explicit priority makes this rule win over the identity rule. -->
    <xsl:template match="text()" priority="1">
        <xsl:value-of select="normalize-space(.)"/>
    </xsl:template>

    <xsl:template match="Section">
        <!--
            Outer grouping: consecutive language elements with the same tag
            sequence form one group. group-adjacent only merges neighbours, so
            languages that share a sequence but are separated by a different
            one end up in separate Dictionary elements.

            The key prefixes every tag value with its length ("2:L1",
            "4:|PLB", ...) before joining. A plain separator is not safe here:
            the data itself contains '|' (tag "|PLB"), so different sequences
            could be joined into the same string.
        -->
        <xsl:for-each-group select="*"
            group-adjacent="string-join(
            for $tag in Para/@*[starts-with(local-name(),'tag_')]
            return concat(string-length($tag), ':', $tag), '')">
            <Dictionary>
                <!--
                    Inner grouping: Para elements that have the same number of
                    tagged following siblings sit at the same place in their
                    language element, so they belong to the same output Para.
                -->
                <xsl:for-each-group select="current-group()/Para"
                    group-by="count(
                    following-sibling::*[@*[starts-with(local-name(),'tag_')]])">
                    <!-- The tag value is taken from the first Para of the group in document order. -->
                    <Para tag="{(current-group()/@*[starts-with(local-name(),'tag_')])[1]}">
                        <xsl:copy-of select="current-group()/*"/>
                    </Para>
                </xsl:for-each-group>
            </Dictionary>
        </xsl:for-each-group>
    </xsl:template>

</xsl:stylesheet>

When applied to the sample input XML, produces the following output:

<Book>
   <Dictionary>
      <Para tag="L1">
         <Content_GB>string_1</Content_GB>
         <Content_DE>German_translation of_string_1</Content_DE>
      </Para>
      <Para tag="Illanc">
         <Content_GB>string_2</Content_GB>
         <Content_DE>German_translation of_string_2</Content_DE>
      </Para>
      <Para tag="|PLB">
         <Content_GB>string_3</Content_GB>
         <Content_DE>German_translation of_string_3</Content_DE>
      </Para>
      <Para tag="L1">
         <Content_GB>string_4</Content_GB>
         <Content_DE>German_translation of_string_4</Content_DE>
      </Para>
      <Para tag="Sub">
         <Content_GB>string_5</Content_GB>
         <Content_DE>German_translation of_string_5</Content_DE>
      </Para>
      <Para tag="L3">
         <Content_GB>string_6</Content_GB>
         <Content_DE>German_translation of_string_6</Content_DE>
      </Para>
      <Para tag="Subbull">
         <Content_GB>string_7</Content_GB>
         <Content_DE>German_translation of_string_7</Content_DE>
      </Para>
   </Dictionary>
   <Dictionary>
      <Para tag="L1">
         <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
      </Para>
      <Para tag="L1_sub">
         <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
      </Para>
      <Para tag="Illanc">
         <Content_DK>Danish_translation_of_string_2</Content_DK>
      </Para>
      <Para tag="L1">
         <Content_DK>Danish_translation_of_string_4</Content_DK>
      </Para>
      <Para tag="|PLB">
         <Content_DK>Danish_translation_of_string_3</Content_DK>
      </Para>
      <Para tag="L3">
         <Content_DK>Danish_translation_of_string_6</Content_DK>
      </Para>
      <Para tag="Sub">
         <Content_DK>Danish_translation_of_string_5</Content_DK>
      </Para>
      <Para tag="Subbull">
         <Content_DK>Danish_translation_of_string_7</Content_DK>
      </Para>
   </Dictionary>
</Book>

Thanks Mads, works like a dream. You even anticipated on the fact I indeed have more than one attribute in my source from hell :-) — Wokoman, Jul 14 '11 at 18:03
+1 for advanced use of `xsl:for-each-group`, even if I've found it not really intuitive. — Emiliano Poggi, Jul 15 '11 at 06:08

Comparing 2 node sets based on attribute sequence

2 Answers