0

i'm trying to transform a text file to an xml file using xslt 2 transformation.

This is the content of the text file:

0000000001  0000000001  ED  I   I       1900-01-01  I   VAT000000000000 BE  1   A       CO      S       451     LD      1010    Stanford
0000000002  0000000002  ED  I   I       1900-01-01  I   VAT000000000000 BE  1   A       CO      S       451     LD      1010    Stanford
0000000003  0000000003  ED  I   I       1900-01-01  I   VAT000000000000 BE  1   A       CO      S       451     LD      1010    Stanford

These columns are separated by "tabulation" key (tab). Some columns are empty but a tab key is used to separate this column from the others. So I've used a regex to do this (the regex is not a simplified form but it can be optimized I think)

This is my XSLT file:

<?xml version="1.1" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl"
    xmlns:xsi="setClients.xsd"
    exclude-result-prefixes="xs xd"
    version="2.0">

    <xsl:output indent="yes"/>
    <xsl:strip-space elements="*"/>

    <xsl:param name="txt-encoding" as="xs:string" select="'iso-8859-1'"/>
    <xsl:param name="txt-uri" as="xs:string" select="'file:///xxxxxxx.txt'"/>

    <xsl:variable name="txt" select="unparsed-text($txt-uri, $txt-encoding)"/>
    <xsl:variable name="entries" as="node()*">
        <xsl:analyze-string select="$txt" regex="\r\n?|\n">
            <xsl:non-matching-substring>
                <xsl:analyze-string select="." regex="(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)">
                    <xsl:matching-substring>
                        <entry>

                            <!-- * infos client -->
                            <c_id><xsl:value-of select="regex-group(1)"/></c_id>
                            <c_shipo_id><xsl:value-of select="normalize-space(regex-group(2))"/></c_shipo_id>
                            <c_company_id><xsl:value-of select="normalize-space(regex-group(3))"/></c_company_id>
                            <c_type_client><xsl:value-of select="normalize-space(regex-group(4))"/></c_type_client>
                            <c_classe><xsl:value-of select="normalize-space(regex-group(5))"/></c_classe>
                            <c_sous_type_client><xsl:value-of select="normalize-space(regex-group(6))"/></c_sous_type_client>
                            <c_start_date><xsl:value-of select="normalize-space(regex-group(7))"/></c_start_date>
                            <c_type_personne><xsl:value-of select="normalize-space(regex-group(8))"/></c_type_personne>
                            <c_type_personne><xsl:value-of select="normalize-space(regex-group(10))"/></c_type_personne>
                            <c_type_document><xsl:value-of select="normalize-space(regex-group(11))"/></c_type_document>
                            <c_num_tva><xsl:value-of select="normalize-space(regex-group(12))"/></c_num_tva>
                            <c_pays><xsl:value-of select="normalize-space(regex-group(13))"/></c_pays>
                            <c_pays_id><xsl:value-of select="normalize-space(regex-group(14))"/></c_pays_id>
                            <c_raison_sociale><xsl:value-of select="normalize-space(regex-group(15))"/></c_raison_sociale>
                            <c_civilité><xsl:value-of select="normalize-space(regex-group(16))"/></c_civilité>
                            <c_name><xsl:value-of select="normalize-space(regex-group(17))"/></c_name>
                            <c_prenom><xsl:value-of select="normalize-space(regex-group(18))"/></c_prenom>
                            <c_complement><xsl:value-of select="normalize-space(regex-group(19))"/></c_complement>
                            <c_adresse_forcee><xsl:value-of select="normalize-space(regex-group(20))"/></c_adresse_forcee>
                            <c_complement_adr><xsl:value-of select="normalize-space(regex-group(21))"/></c_complement_adr>
                            <c_numero><xsl:value-of select="normalize-space(regex-group(22))"/></c_numero>
                            <c_complement_numero><xsl:value-of select="normalize-space(regex-group(23))"/></c_complement_numero>
                            <c_adresse_rue><xsl:value-of select="normalize-space(regex-group(24))"/></c_adresse_rue>
                            <c_lieu_dit><xsl:value-of select="normalize-space(regex-group(25))"/></c_lieu_dit>
                            <c_code_postal><xsl:value-of select="normalize-space(regex-group(26))"/></c_code_postal>
                            <c_localite><xsl:value-of select="normalize-space(regex-group(27))"/></c_localite>
                            <c_telephone><xsl:value-of select="normalize-space(regex-group(28))"/></c_telephone>
                            <c_telephone_mobile><xsl:value-of select="normalize-space(regex-group(29))"/></c_telephone_mobile>
                            <c_email><xsl:value-of select="normalize-space(regex-group(30))"/></c_email>
                            <c_fax><xsl:value-of select="normalize-space(regex-group(31))"/></c_fax>
                            <c_actif><xsl:value-of select="normalize-space(regex-group(32))"/></c_actif>
                            <c_date_creation><xsl:value-of select="normalize-space(regex-group(33))"/></c_date_creation>
                            <c_langue><xsl:value-of select="normalize-space(regex-group(34))"/></c_langue>
                            <c_sapcode><xsl:value-of select="normalize-space(regex-group(35))"/></c_sapcode>
                            <c_invoicecopies><xsl:value-of select="normalize-space(regex-group(36))"/></c_invoicecopies>
                            <c_flttax><xsl:value-of select="normalize-space(regex-group(37))"/></c_flttax>
                            <c_flttax1><xsl:value-of select="normalize-space(regex-group(38))"/></c_flttax1>
                            <c_flttax2><xsl:value-of select="normalize-space(regex-group(39))"/></c_flttax2>
                            <c_flttax3><xsl:value-of select="normalize-space(regex-group(40))"/></c_flttax3>
                            <c_fldistax><xsl:value-of select="normalize-space(regex-group(41))"/></c_fldistax>
                            <c_equaladressbilling><xsl:value-of select="normalize-space(regex-group(42))"/></c_equaladressbilling>
                            <c_equaladressinvoice><xsl:value-of select="normalize-space(regex-group(43))"/></c_equaladressinvoice>
                            <c_equaladresspayment><xsl:value-of select="normalize-space(regex-group(44))"/></c_equaladresspayment>
                            <c_collectiontype><xsl:value-of select="normalize-space(regex-group(45))"/></c_collectiontype>
                            <c_companycode><xsl:value-of select="normalize-space(regex-group(46))"/></c_companycode>
                            <c_collectionid><xsl:value-of select="normalize-space(regex-group(47))"/></c_collectionid>
                            <c_duedatefree><xsl:value-of select="normalize-space(regex-group(48))"/></c_duedatefree>
                            <c_payements><xsl:value-of select="normalize-space(regex-group(49))"/></c_payements>
                            <c_paynum><xsl:value-of select="normalize-space(regex-group(50))"/></c_paynum>
                            <c_iban><xsl:value-of select="normalize-space(regex-group(51))"/></c_iban>
                            <c_mandate><xsl:value-of select="normalize-space(regex-group(52))"/></c_mandate>
                            <c_mandatedate><xsl:value-of select="normalize-space(regex-group(53))"/></c_mandatedate>
                            <c_expdate><xsl:value-of select="normalize-space(regex-group(54))"/></c_expdate>
                            <c_securitycode><xsl:value-of select="normalize-space(regex-group(55))"/></c_securitycode>
                            <c_authornum><xsl:value-of select="normalize-space(regex-group(56))"/></c_authornum>
                            <c_cardtype><xsl:value-of select="normalize-space(regex-group(57))"/></c_cardtype>
                            <c_bank><xsl:value-of select="normalize-space(regex-group(58))"/></c_bank>
                            <c_fixedue><xsl:value-of select="normalize-space(regex-group(59))"/></c_fixedue>
                            <c_duelastday><xsl:value-of select="normalize-space(regex-group(60))"/></c_duelastday>
                            <c_dateini><xsl:value-of select="normalize-space(regex-group(61))"/></c_dateini>
                            <c_datefin><xsl:value-of select="normalize-space(regex-group(62))"/></c_datefin>
                            <c_compfactid><xsl:value-of select="normalize-space(regex-group(63))"/></c_compfactid>
                            <c_persontype><xsl:value-of select="normalize-space(regex-group(64))"/></c_persontype>

                            <!-- * infos ptv -->

                            <p_id><xsl:value-of select="normalize-space(regex-group(65))"/></p_id>
                            <p_type><xsl:value-of select="normalize-space(regex-group(66))"/></p_type>
                            <p_useguide><xsl:value-of select="normalize-space(regex-group(67))"/></p_useguide>
                            <p_controldelivery><xsl:value-of select="normalize-space(regex-group(68))"/></p_controldelivery>
                            <p_copiesinitonclose><xsl:value-of select="normalize-space(regex-group(69))"/></p_copiesinitonclose>
                            <p_nominative><xsl:value-of select="normalize-space(regex-group(70))"/></p_nominative>
                            <p_sapcode><xsl:value-of select="normalize-space(regex-group(71))"/></p_sapcode>
                            <p_pays><xsl:value-of select="normalize-space(regex-group(72))"/></p_pays>
                            <p_raison_sociale><xsl:value-of select="normalize-space(regex-group(73))"/></p_raison_sociale>
                            <p_civilite><xsl:value-of select="normalize-space(regex-group(74))"/></p_civilite>
                            <p_nom><xsl:value-of select="normalize-space(regex-group(75))"/></p_nom>
                            <p_prenom><xsl:value-of select="normalize-space(regex-group(76))"/></p_prenom>
                            <p_complement><xsl:value-of select="normalize-space(regex-group(77))"/></p_complement>
                            <p_adresse_forcee><xsl:value-of select="normalize-space(regex-group(78))"/></p_adresse_forcee>
                            <p_complement_adr><xsl:value-of select="normalize-space(regex-group(79))"/></p_complement_adr>
                            <p_numero><xsl:value-of select="normalize-space(regex-group(80))"/></p_numero>
                            <p_complement_numero><xsl:value-of select="normalize-space(regex-group(81))"/></p_complement_numero>
                            <p_adresse_rue><xsl:value-of select="normalize-space(regex-group(82))"/></p_adresse_rue>
                            <p_lieu_dit><xsl:value-of select="normalize-space(regex-group(83))"/></p_lieu_dit>
                            <p_code_postal><xsl:value-of select="normalize-space(regex-group(84))"/></p_code_postal>
                            <p_localite><xsl:value-of select="normalize-space(regex-group(85))"/></p_localite>
                            <p_telephone><xsl:value-of select="normalize-space(regex-group(86))"/></p_telephone>
                            <p_telephone_mobile><xsl:value-of select="normalize-space(regex-group(87))"/></p_telephone_mobile>
                            <p_email><xsl:value-of select="normalize-space(regex-group(88))"/></p_email>
                            <p_fax><xsl:value-of select="normalize-space(regex-group(89))"/></p_fax>
                            <p_deliverydays><xsl:value-of select="normalize-space(regex-group(90))"/></p_deliverydays>
                            <p_active><xsl:value-of select="normalize-space(regex-group(91))"/></p_active>
                            <p_gestamp><xsl:value-of select="normalize-space(regex-group(92))"/></p_gestamp>
                            <p_numamp><xsl:value-of select="normalize-space(regex-group(93))"/></p_numamp>
                            <p_numcasieramp><xsl:value-of select="normalize-space(regex-group(94))"/></p_numcasieramp>
                            <p_numboxamp><xsl:value-of select="normalize-space(regex-group(95))"/></p_numboxamp>
                            <p_distributeur><xsl:value-of select="normalize-space(regex-group(96))"/></p_distributeur>
                            <p_enseigneamp><xsl:value-of select="normalize-space(regex-group(97))"/></p_enseigneamp>
                            <p_typeamp><xsl:value-of select="normalize-space(regex-group(98))"/></p_typeamp>
                            <zero><xsl:value-of select="normalize-space(regex-group(99))"/></zero>

                        </entry>

                    </xsl:matching-substring>
                </xsl:analyze-string>
            </xsl:non-matching-substring>
        </xsl:analyze-string>
    </xsl:variable>

    <xsl:template match="/" name="text2xml">

        <clients>
            <xsl:for-each select="$entries">
                <client>
                    <xsl:attribute name="companyID">
                        <xsl:value-of select="c_company_id"/>
                    </xsl:attribute>
                    <xsl:attribute name="clientID" >
                        <xsl:value-of select="c_id"/>
                    </xsl:attribute>
                    <general>
                        <xsl:attribute name="startDate">
                            <xsl:value-of select="c_start_date"/>
                        </xsl:attribute>
                        <xsl:attribute name="country">
                            <xsl:value-of select="c_pays_id"/>
                        </xsl:attribute>
                        <xsl:attribute name="clientType">
                            <xsl:value-of select="c_type_client"/>
                        </xsl:attribute>
                        <xsl:attribute name="clientSubtype">
                            <xsl:value-of select="c_sous_type_client"/>
                        </xsl:attribute>
                        <xsl:attribute name="clientClass">
                            <xsl:value-of select="c_classe"/>
                        </xsl:attribute>
                    </general>
                </client>


            </xsl:for-each>     
        </clients>
    </xsl:template>
</xsl:stylesheet>

So when I generate an XML based on this xslt file (executed on oxygen XML EDITOR) it takes one second to do the stuff when the number of columns are < 20. When I add some othe columns, latency appears and I should wait more than 15 seconds to generate the XML file. It's a bad thing for me because I need to treat 99 columns in my XSLT file.

Can you tell me why there is a problem of performance there? What should I do? Any advice could help me.

Thanks.

amin89
  • 558
  • 1
  • 8
  • 26
  • So which XSLT processor do you use with oXygen? Have you tried whether using Saxon EE improves performance? Have you tried whether using `` and then e.g. `` instead of `regex-group(1)` performs better? – Martin Honnen Dec 08 '17 at 13:32
  • I'm using Saxon-EE 9.3.0 and tried PE & HE from the same version. No changes. I didn't use the tokenize function. Maybe I should. I will tell if it works.thanks ;) – amin89 Dec 08 '17 at 13:34
  • @martin is it possible based on my code to give me your solution with the tokenize function? It's not working like I've implemented it. thank you in advance. – amin89 Dec 11 '17 at 11:02
  • 1
    I am afraid the sample you have posted does not contain any tabs at all and also I not in the mood to convert all that code for lots of columns by hand for you, I have set up http://xsltransform.hikmatu.com/bFukv8f to show that tokenize allows you to split into lines first and then into columns, you will need to adapt that code then to output your desired column element names e.g. `` for your purpose. http://xsltransform.hikmatu.com/bFukv8f/1 shows it for three columns. – Martin Honnen Dec 11 '17 at 11:31
  • thank you it's exactly what I've need. It's a good beginning. thank you ;) – amin89 Dec 11 '17 at 11:37
  • @MartinHonnen Now there is no latency it's fine but I have onother problem: When I make select="tokenize(., '\t+')"/> it doesn't count the /t like a column. If we take the example of 3 entries, it consider the "" the 6th column instead of the 7th ! What should I do? – amin89 Dec 11 '17 at 12:50
  • 1
    Don't use `tokenize(., '\t+')` for the columns, use `tokenize(., '\t')`, that way the returned sequence should have an empty string item if there are two adjacent tab characters. – Martin Honnen Dec 11 '17 at 13:20
  • I have morphed the suggestions made in the comments into an answer so you can mark your question as solved if the suggestions helped. – Martin Honnen Dec 11 '17 at 16:20

2 Answers2

2

I think the performance problem is almost certainly in the regular expression

regex="(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)\t(.*)"

and the reason it performs badly is that it's got a high level of ambiguity and therefore backtracking.

Remember that .* will match as many characters as it can - including tabs. So it will start by swallowing the whole line, then realise that this doesn't lead to a match, then backtrack until it finds a tab character, then realise that this fails again, and so on. The simplest fix would be to replace each (.*) by ([^\t]*) (that is, a sequence of any characters except tabs). But tokenizing on tab as a separator, as @MartinHonnen suggests, is much simpler.

Michael Kay
  • 156,231
  • 11
  • 92
  • 164
1

Based on our exchange in the comments I think you can try an alternative approach using tokenize instead of xsl:analyze-string to parse the input into lines and then the lines into columns and see whether that performs better.

An example using tokenize is

<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0"
  xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="xs">

   <xsl:variable name="tsv" as="xs:string">0000000001  0000000001  ED  I   I       1900-01-01  I   VAT000000000000 BE  1   A       CO      S       451     LD      1010    Stanford
0000000002  0000000002  ED  I   I       1900-01-01  I   VAT000000000000 BE  1   A       CO      S       451     LD      1010    Stanford
0000000003  0000000003  ED  I   I       1900-01-01  I   VAT000000000000 BE  1   A       CO      S       451     LD      1010    Stanford</xsl:variable>

   <xsl:output indent="yes"/>

   <xsl:variable name="lines" as="xs:string*" select="tokenize($tsv, '\r?\n')[normalize-space()]"/>
   <xsl:variable name="entries" as="node()*">

                    <xsl:for-each select="$lines">
                        <xsl:for-each select="tokenize(., '\s+')"><!-- if the input is really tab separated we need '\t' as the second argument for tokenize -->
                            <entry>
                                <col pos="{position()}">
                                    <xsl:value-of select="normalize-space()"/>
                                </col>
                            </entry>
                        </xsl:for-each>
                    </xsl:for-each>

    </xsl:variable>

    <xsl:template match="/" name="text2xml">

        <clients>
           <xsl:copy-of select="$entries"/>

            <xsl:for-each select="$entries">
                <client>
                    <xsl:attribute name="companyID">
                        <xsl:value-of select="c_company_id"/>
                    </xsl:attribute>
                    <xsl:attribute name="clientID" >
                        <xsl:value-of select="c_id"/>
                    </xsl:attribute>
                    <general>
                        <xsl:attribute name="startDate">
                            <xsl:value-of select="c_start_date"/>
                        </xsl:attribute>
                        <xsl:attribute name="country">
                            <xsl:value-of select="c_pays_id"/>
                        </xsl:attribute>
                        <xsl:attribute name="clientType">
                            <xsl:value-of select="c_type_client"/>
                        </xsl:attribute>
                        <xsl:attribute name="clientSubtype">
                            <xsl:value-of select="c_sous_type_client"/>
                        </xsl:attribute>
                        <xsl:attribute name="clientClass">
                            <xsl:value-of select="c_classe"/>
                        </xsl:attribute>
                    </general>
                </client>


            </xsl:for-each>     
        </clients>
    </xsl:template>
</xsl:transform>

that simply generates the same column name, to adjust to your custom column names use

<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0"
  xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="xs">

   <xsl:variable name="tsv" as="xs:string">0000000001  0000000001  ED  I   I       1900-01-01  I   VAT000000000000 BE  1   A       CO      S       451     LD      1010    Stanford
0000000002  0000000002  ED  I   I       1900-01-01  I   VAT000000000000 BE  1   A       CO      S       451     LD      1010    Stanford
0000000003  0000000003  ED  I   I       1900-01-01  I   VAT000000000000 BE  1   A       CO      S       451     LD      1010    Stanford</xsl:variable>

   <xsl:output indent="yes"/>

   <xsl:variable name="lines" as="xs:string*" select="tokenize($tsv, '\r?\n')[normalize-space()]"/>
   <xsl:variable name="entries" as="node()*">

                    <xsl:for-each select="$lines">

                            <entry>
                            <xsl:variable name="columns" select="tokenize(., '\s+')"/><!-- if the input is really tab separated we need '\t' as the second argument for tokenize -->
                          <c_id><xsl:value-of select="$columns[1]"/></c_id>
                            <c_shipo_id><xsl:value-of select="$columns[2]"/></c_shipo_id>
                            <c_company_id><xsl:value-of select="$columns[3]"/></c_company_id>
                            </entry>

                    </xsl:for-each>

    </xsl:variable>

    <xsl:template match="/" name="text2xml">

        <clients>

            <xsl:for-each select="$entries">
                <client>
                    <xsl:attribute name="companyID">
                        <xsl:value-of select="c_company_id"/>
                    </xsl:attribute>
                    <xsl:attribute name="clientID" >
                        <xsl:value-of select="c_id"/>
                    </xsl:attribute>
                    <general>
                        <xsl:attribute name="startDate">
                            <xsl:value-of select="c_start_date"/>
                        </xsl:attribute>
                        <xsl:attribute name="country">
                            <xsl:value-of select="c_pays_id"/>
                        </xsl:attribute>
                        <xsl:attribute name="clientType">
                            <xsl:value-of select="c_type_client"/>
                        </xsl:attribute>
                        <xsl:attribute name="clientSubtype">
                            <xsl:value-of select="c_sous_type_client"/>
                        </xsl:attribute>
                        <xsl:attribute name="clientClass">
                            <xsl:value-of select="c_classe"/>
                        </xsl:attribute>
                    </general>
                </client>


            </xsl:for-each>     
        </clients>
    </xsl:template>
</xsl:transform>

Online samples are at http://xsltransform.hikmatu.com/bFukv8f and http://xsltransform.hikmatu.com/bFukv8f/1, to tokenize columns where to adjacent tab characters indicate an empty column use tokenize(., '\t').

Martin Honnen
  • 160,499
  • 6
  • 90
  • 110