0

I'm trying to transform multiple METS-XML files by selecting "issues" via this XQuery:

(:  1:) xquery version "3.1";
(:  2:) 
(:  3:) import module namespace safename="http://my-xquery-module.org/safeName" at "xmldb:exist:///db/apps/digiZeitung/safeName.xqm";
(:  4:) 
(:  5:) declare namespace mets="http://www.loc.gov/METS/";
(:  6:) declare namespace mods="http://www.loc.gov/mods/v3";
(:  7:) declare namespace xlink="http://www.w3.org/1999/xlink";
(:  8:) declare namespace oai-pmh="http://www.openarchives.org/OAI/2.0/";
(:  9:) 
(:    :) (: Looks up the "object in context" URL of the METS document that contains $issue. :)
(:    :) (: The MODS record is found through the @DMDID value(s) of the logical mets:div selected by [not(mets:mptr)][1]. :)
(:    :) (: Returns the matching mods:url element(s); empty sequence if nothing matches. :)
(: 10:) declare function local:url($issue as element()) {
(: 11:)     let $doc := root($issue)
(: 12:)     let $topDiv := $doc/mets:mets/mets:structMap[@TYPE="LOGICAL"]//mets:div[not(mets:mptr)][1]
(: 13:)     let $ids := $topDiv/@DMDID ! tokenize(., '\s+') (: @DMDID may hold several whitespace-separated IDs :)
(: 14:)     let $location := $doc/mets:mets/mets:dmdSec[@ID=$ids]/mets:mdWrap/mets:xmlData/mods:mods/mods:location
(: 15:)     return $location/mods:url[@access="object in context"]
(: 16:) };
(:    :) (: Project name = fifth "/"-separated token of the URL that local:url finds for $issue's document. :)
(: 17:) declare function local:projectname($issue as element()) {
(: 18:)     let $tokens := tokenize(local:url($issue), "/")
(: 19:)     return $tokens[5]
(: 20:) };
(: 21:) 
(:    :) (: Returns the atomized ISO 8601 key date of publication for $issue, taken from the MODS record(s) :)
(:    :) (: that $issue/@DMDID points to. Yields the empty sequence when no such mods:dateIssued exists. :)
(: 22:) declare function local:issue-date($issue as element()) {
(: 23:)     let $doc := root($issue)
(: 24:)     let $ids := $issue/@DMDID ! tokenize(., '\s+') (: @DMDID may hold several whitespace-separated IDs :)
(: 25:)     let $record := $doc/mets:mets/mets:dmdSec[@ID=$ids]/mets:mdWrap/mets:xmlData/mods:mods
(: 26:)     let $publication := $record/mods:originInfo[@eventType='publication']
(: 27:)     return data($publication/mods:dateIssued[@keyDate="yes" and @encoding='iso8601'][1])
(: 28:) };
(: 29:) 
(:    :) (: Main query: for every issue of every volume referenced from the top-level newspaper METS file, :)
(:    :) (: build one OAI-PMH GetRecord wrapper around a reduced METS document for that single issue. :)
(:    :) (: The $ga_* variables are read once from the top-level file and copied into every record. :)
(: 30:) let $docname := "heidelberger_tageblatt"
(: 31:) let $ga := doc("/db/resources/digiZeitung/" || $docname || ".xml")
(: 32:) let $ga_div := $ga/mets:mets/mets:structMap[@TYPE="LOGICAL"]/mets:div[1]
(: 33:)     (:  let $gaDiv2 := $gaDiv/@* -- does not work: https://stackoverflow.com/questions/3026038/how-to-get-node-without-children-in-xquery :)
(: 34:) let $ga_dmdids := tokenize( data($ga_div/@DMDID), '\s+')
(: 35:) let $ga_dmdsec := $ga/mets:mets/mets:dmdSec[@ID=$ga_dmdids]/mets:mdWrap/mets:xmlData/mods:mods/../../..
(: 36:) let $ga_mods := $ga_dmdsec/mets:mdWrap/mets:xmlData/mods:mods
(: 37:) let $ga_lang := $ga_mods/mods:language
(: 38:) let $ga_recordIdentifier_urn := data($ga_mods/mods:recordInfo/mods:recordIdentifier[@source="urn"])
(: 39:) let $ga_url := local:url($ga_div)
(: 40:) let $ga_projectname := local:projectname($ga_div)
(: 41:) 
(: 42:)     (: ==================================================================================== :)
(:    :)     (: Outer loop: each mets:mptr in the top-level file points to one volume document ($band). :)
(: 43:) for $mptr in $ga/mets:mets/mets:structMap[@TYPE="LOGICAL"]//mets:div/mets:mptr
(: 44:) let $href := data($mptr/@xlink:href) (: do NOT put [] around @xlink:href! :)
(: 45:) let $projectname :=tokenize($href, '/')[5]
(: 46:) let $band := doc('/db/resources/digiZeitung/' || $projectname || '.xml')
(: 47:) let $top := $band/mets:mets/mets:structMap[@TYPE="LOGICAL"]//mets:div[not(mets:mptr)][1]
(:    :) let $admids := $top ! data(@ADMID) ! tokenize(., '\s+') (: @ADMID may list several whitespace-separated IDs, so it is tokenized like @DMDID :)
(: 48:) let $amdsec := $band/mets:mets/mets:amdSec[@ID=$admids]
(: 49:) let $ga_href := data($band/mets:mets/mets:structMap[@TYPE="LOGICAL"]/mets:div/mets:mptr/@xlink:href)
(: 50:) 
(: 51:)     (: ==================================================================================== :)
(:    :)     (: Inner loop: every issue of the volume that has a publication key date. :)
(: 52:) for $issue in $band/mets:mets/mets:structMap[@TYPE="LOGICAL"]//mets:div[@TYPE="issue"]
(: 53:) where local:issue-date($issue)
(: 54:) let $dmdids := $issue ! data(@DMDID) ! tokenize(., '\s+')
(: 55:) let $dmdsec := $band/mets:mets/mets:dmdSec[@ID=$dmdids]/mets:mdWrap/mets:xmlData/mods:mods/mods:originInfo/mods:dateIssued[@encoding="iso8601"]/../../../../..
(: 56:) let $subdmdids := $issue//mets:div[@DMDID] ! data(@DMDID) ! tokenize(., '\s+')
(: 57:)     (: TODO »//@DMDID« does NOT work when there are several @DMDIDs :)
(: 58:)     (:  let $dmdid_query := string-join(tokenize($dmdids, '\s+'), " or ") ... map ...
(: 59:)         XQuery / Walmsley / 2nd Ed.: XQuery does not provide any built-in
(: 60:)         support for evaluating dynamic paths
(: 61:)     :)
(: 62:) let $mods := $band/mets:mets/mets:dmdSec[@ID=$dmdids]/mets:mdWrap/mets:xmlData/mods:mods
(: 63:)     (: let $dmd := $mods/../../.. :)
(: 64:) let $subdmdsecs := $band/mets:mets/mets:dmdSec[@ID=$subdmdids]/mets:mdWrap/mets:xmlData/mods:mods/../../..
(: 65:) let $date := data($mods/mods:originInfo[@eventType='publication']/mods:dateIssued[@keyDate="yes" and @encoding='iso8601'][1])
(: 66:) let $y := fn:year-from-date($date)
(: 67:) let $ym := $y || "-" || fn:format-number(fn:month-from-date($date), "00")
(: 68:) let $ymd := $ym || "-" || fn:format-number(fn:day-from-date($date), "00")
(: 69:) 
(: 70:)     (: Example: <mets:div ID="log00004" TYPE="issue" DMDID="dmd00004" LABEL="04.01.1900"> :)
(: 71:) let $id := data($issue/@ID)
(: 72:) let $smLi := $band/mets:mets/mets:structLink/mets:smLink[@xlink:from=$id]
(: 73:) let $phy := data($smLi/@xlink:to) (: atomized for the same reason as $fid below :)
(: 74:) let $smp := $band/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@ID=$phy]
(: 75:) let $fid := data($smp/mets:fptr/@FILEID) (: otherwise $fid is »@FILEID=...« and the following »[@ID=$fid]« does not use an index :)
(: 76:) (:  let $fs := $band/mets:mets/mets:fileSec/mets:fileGrp/mets:file[@ID=$fid]  :)
(: 77:) let $fs := 
(: 78:)         for $fg in $band/mets:mets/mets:fileSec/mets:fileGrp[mets:file[@ID=$fid]]
(: 79:)         return <mets:fileGrp USE="{$fg/@USE}">
(: 80:)         {
(: 81:)             $fg/mets:file[@ID=$fid]
(: 82:)         }
(: 83:)         </mets:fileGrp>
(: 84:) return 
(: 85:)     <oai-pmh:OAI-PMH xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
(: 86:)     <oai-pmh:GetRecord> 
(: 87:)     <oai-pmh:record>
(: 88:)     <oai-pmh:header>
(: 89:)         <oai-pmh:identifier>{$projectname || "--" || $issue/@ID}</oai-pmh:identifier>
(: 90:)     </oai-pmh:header>
(: 91:)     <oai-pmh:metadata>
(: 92:)     <mets:mets>
(: 93:)         {$amdsec}
(: 94:)         {$ga_dmdsec}
(: 95:)         <mets:dmdSec ID="{$dmdsec/@ID}">
(: 96:)             <mets:mdWrap MDTYPE="MODS">
(: 97:)                 <mets:xmlData>
(: 98:)                     <mods:mods>
(: 99:)                         {insert-before(
(:100:)                             insert-before((:language_01:)$dmdsec/mets:mdWrap/mets:xmlData/mods:mods/*,
(:101:)                                 1, $ga_lang),
(:102:)                             1,
(:103:)                             <mods:recordInfo>
(:104:)                                 <mods:recordIdentifier source="urn">
(:105:)                                     {$ga_recordIdentifier_urn || "--" || $ymd || "--" || $issue/@ID}
(:106:)                                 </mods:recordIdentifier>
(:107:)                             </mods:recordInfo>
(:108:)                         )}
(:109:)                     </mods:mods>
(:110:)                 </mets:xmlData>
(:111:)             </mets:mdWrap>
(:112:)         </mets:dmdSec>
(:113:)         {$subdmdsecs}
(:114:)                         
(:115:)         <mets:structMap TYPE="LOGICAL">
(:116:)             <mets:div ID="{$ga_projectname}" TYPE="newspaper" DMDID="{$ga_dmdids}"><!-- possible ID collision with issue -->
(:117:)                 <mets:mptr LOCTYPE="URL" xlink:href="{$ga_href}"/>
(:118:)                 <mets:div ID="{$ga_projectname || "--" || $y}" TYPE="year" ORDERLABEL="{$y}">
(:119:)                     <mets:mptr LOCTYPE="URL" xlink:href="{$ga_href || "/" || $y}"/>
(:120:)                     <mets:div ID="{$ga_projectname || "--" || $ym}" TYPE="month" ORDERLABEL="{$ym}">
(:121:)                         <mets:div ID="{$ga_projectname || "--" || $ymd}" TYPE="day" ORDERLABEL="{$ymd}">
(:122:)                             <mets:div ID="{(: $projectname || "--" || → NO, otherwise the smLink would no longer match :) $issue/@ID}" DMDID="{$issue/@DMDID}" TYPE="issue" />
(:123:)                         </mets:div>
(:124:)                     </mets:div>
(:125:)                 </mets:div>
(:126:)             </mets:div>
(:127:)         </mets:structMap>
(:128:)         <mets:structMap TYPE="PHYSICAL">
(:129:)         <mets:div ID="phys0" TYPE="physSequence">
(:130:)             {$smp}
(:131:)         </mets:div>
(:132:)         </mets:structMap>
(:133:)         <mets:structLink>
(:134:)             <mets:smLink xlink:from="{$id}" xlink:to="phys0" />
(:135:)             {$smLi}
(:136:)         </mets:structLink>
(:137:)         <mets:fileSec>
(:138:)         {$fs}
(:139:)         </mets:fileSec>
(:140:)     </mets:mets>
(:141:)     </oai-pmh:metadata>
(:142:)     </oai-pmh:record>
(:143:)     </oai-pmh:GetRecord>
(:144:)     </oai-pmh:OAI-PMH>

The query works, but is very slow.

Monex says "No index", even though, e.g., the predicate in line 72 compares @xlink:from, for which the corresponding collection.xconf contains a "new range" index entry, and the files in eXist-db's data directory data/range do contain string values of this attribute.

collection.xconf:

<collection xmlns="http://exist-db.org/collection-config/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <index xmlns:mets="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink">
        <!-- Range index definitions: each attribute listed below is indexed as xs:string.
             The xlink: prefix used in the qnames is bound by the namespace declaration on <index>. -->
        <range>
            <create qname="@ADMID" type="xs:string"/>
            <create qname="@DMDID" type="xs:string"/>
            <create qname="@encoding" type="xs:string"/>
            <create qname="@eventType" type="xs:string"/>
            <create qname="@FILEID" type="xs:string"/>
            <create qname="@ID" type="xs:string"/>
            <create qname="@keyDate" type="xs:string"/>
            <create qname="@TYPE" type="xs:string"/>
            <create qname="@USE" type="xs:string"/>
            <create qname="@xlink:from" type="xs:string"/>
            <create qname="@xlink:href" type="xs:string"/>
            <create qname="@xlink:to" type="xs:string"/>
        </range>
    </index>
</collection>

Merged screenshot excerpts of Monex / XQuery / collection.xconf

So in short I expected eXist-db to use the "new range" index.

(Perhaps someone could point me to a good debugging/logging facility for finding out why eXist-db is not using the index.)

  • `eq` instead of `=` does help, but is not possible in all cases due to _cardinality_. Tried `...[id($x)]`, but this does not match as expected – user1986384 May 23 '23 at 17:13
  • `id()` is for attribute `id`, **not** for attribute `ID` – user1986384 May 24 '23 at 12:50
  • 1
    in fact `id()` will _only_ work for `@xml:id` – line-o May 25 '23 at 09:34
  • Tried to ensure/check index use with `(# exist:force-index-use #) { ... }` but this does not work as expected: https://github.com/eXist-db/exist/issues/4942 – user1986384 May 25 '23 at 13:04
  • To aid in troubleshooting, I'd suggest reducing your report to the bare minimum source document and query to demonstrate the error, and to include (or link to) the document and index configuration. There's a lot more here than is needed to demonstrate the issue. (The XQSuite in the linked issue is a great way to express this.) – Joe Wicentowski May 25 '23 at 14:59
  • Tried BaseX: approx. 60–100 times faster than eXist-db. I only had to adapt the path in the `doc()` call and replace `request:get-parameter()`; the rest of the XQuery is 100% identical. Same results, except that BaseX emits a few redundant `xmlns:…=…` declarations. Some overhead with the Perl client API, no overhead with the Java client API. – user1986384 Jun 02 '23 at 09:53

0 Answers0