0

I am currently trying to edit a PDF with hummus.js to replace certain placeholder characters in the PDF with others. This works perfectly fine for almost every page, but for certain pages I get the following error:

TypeError: pageObject.getDictionary(...).toJSObject(...).Contents.getObjectID is not a function for: const textObjectID = pageObject.getDictionary().toJSObject().Contents.getObjectID();

Whenever that error happens, pageObject.getDictionary().toJSObject().Contents returns a PDFArray {}, whereas when it works, it returns a PDFIndirectObjectReference {}.

Does anybody know how I could solve this issue, or know of any other way to edit a PDF like this (replacing certain placeholder characters with others)?

UPDATE 1: I can now differentiate between IndirectObjectReferences and arrays containing IndirectObjectReferences. However, for some reason, on the PDF pages whose contents I access through these arrays, all text is removed while the images are kept. The rest of the PDF pages still work perfectly fine.

/**
 * Builds the right-hand replacement text for a page number: the number
 * preceded by a run of spaces (8 for one digit, 7 for two digits, 6 for
 * three or more digits).
 *
 * @param {number} pageNumber - 1-based page number.
 * @returns {string} The space-padded page number.
 */
function padPageNumberText(pageNumber) {
    let spaceCount = 8;
    if (pageNumber > 99) {
        spaceCount = 6;
    } else if (pageNumber > 9) {
        spaceCount = 7;
    }
    return " ".repeat(spaceCount) + pageNumber.toString();
}

/**
 * Reads an entire PDF stream object into a fresh array of byte values.
 *
 * A new array is created on every call, so bytes of one stream can never
 * leak into the next one.
 *
 * @param {object} parser - The source document parser of the copying context.
 * @param {object} streamObject - The stream object to read.
 * @returns {number[]} The bytes returned by the stream reader.
 */
function readPdfStreamBytes(parser, streamObject) {
    const reader = parser.startReadingFromStream(streamObject);
    let bytes = [];
    while (reader.notEnded()) {
        bytes = bytes.concat(reader.read(10000));
    }
    return bytes;
}

/**
 * Replaces the placeholders "12345" and "67890" in one content stream and
 * writes the result back as a modified version of the same indirect object.
 *
 * NOTE(review): the stream bytes are decoded with Buffer's default encoding
 * and re-encoded by `strToByteArray` (defined elsewhere); confirm this
 * round-trip is lossless for streams containing non-text bytes.
 *
 * @param {object} parser - The source document parser used to read the stream.
 * @param {object} objectsContext - The writer's objects context.
 * @param {object} streamObject - The content stream to read.
 * @param {number} objectID - Object ID of the stream being overwritten.
 * @param {string} leftText - Replacement for the "12345" placeholder.
 * @param {string} rightText - Replacement for the "67890" placeholder.
 */
function rewriteContentStream(parser, objectsContext, streamObject, objectID, leftText, rightText) {
    const originalText = Buffer.from(readPdfStreamBytes(parser, streamObject)).toString();
    const leftReplaced = replace(originalText, "12345", leftText);
    const fullyReplaced = replace(leftReplaced, "67890", rightText);

    objectsContext.startModifiedIndirectObject(objectID);
    const stream = objectsContext.startUnfilteredPDFStream();
    stream.getWriteStream().write(strToByteArray(fullyReplaced));
    objectsContext.endPDFStream(stream);
    objectsContext.endIndirectObject();
}

/**
 * Replaces the page-number placeholders on every page of `<filePath>.pdf`.
 *
 * The modified document is written to `<filePath>-modified.pdf`, which is
 * then passed to `hummus.recrypt` with `filePath` as the target. A page's
 * Contents entry may be a single stream or an array of streams; both shapes
 * are handled, and each stream in an array is rewritten independently.
 *
 * Errors raised while processing a page are logged and the remaining pages
 * are still processed.
 *
 * @param {string} filePath - Path of the source PDF without the ".pdf" suffix.
 * @param {Function} callback - Invoked with no arguments once processing ends.
 */
async function replacetext(filePath, callback) {
    const modPdfWriter = hummus.createWriterToModify(filePath+'.pdf', { modifiedFilePath: `${filePath}-modified.pdf`, compress: false });
    const numPages = modPdfWriter.createPDFCopyingContextForModifiedFile().getSourceDocumentParser().getPagesCount();
    let log = "";

    for (let page = 0; page < numPages; page++) {
        const copyingContext = modPdfWriter.createPDFCopyingContextForModifiedFile();
        const objectsContext = modPdfWriter.getObjectsContext();
        const parser = copyingContext.getSourceDocumentParser();

        const pageObject = parser.parsePage(page);
        // Page numbers shown in the document are 1-based.
        const pageCount = page + 1;

        try {
            const leftText = pageCount.toString();
            const replaceTextRight = padPageNumberText(pageCount);

            const textStream = parser.queryDictionaryObject(pageObject.getDictionary(), 'Contents');
            console.log(textStream);

            if (textStream.toString() === 'Array') {
                // Contents is an array of references to content streams:
                // rewrite each stream under its own object ID.
                const streamRefs = textStream.toJSArray();
                for (let i = 0; i < streamRefs.length; i++) {
                    const streamObject = parser.queryArrayObject(textStream, i);
                    rewriteContentStream(parser, objectsContext, streamObject, streamRefs[i].getObjectID(), leftText, replaceTextRight);
                    log = `${log} \n Replaced ${page} with ${leftText}`;
                    log = `${log} \n Replaced ${page} with ${replaceTextRight}`;
                }
            } else {
                // Contents is a single indirect stream.
                const textObjectID = pageObject.getDictionary().toJSObject().Contents.getObjectID();
                rewriteContentStream(parser, objectsContext, textStream, textObjectID, leftText, replaceTextRight);
                log = `${log} \n Replaced ${page} with ${leftText}`;
                log = `${log} \n Replaced ${page} with ${replaceTextRight}`;
            }
        } catch (e) {
            // Best effort: report the failure and continue with the next page.
            console.log(e);
        }
    }

    fs.writeFile('C:\\Users\\Administrator\\Downloads\\PDFModifyDebugging\\PDFWRITELOG.txt', log, function (err) {
        if (err) return console.log(err);
        console.log('written in file');
    });
    modPdfWriter.end();

    hummus.recrypt(`${filePath}-modified.pdf`, filePath);
    callback();
}
KevGi_AWSi
  • 41
  • 5
  • The **Contents** of a page dictionary may be either an always indirect stream object _or a direct or indirect array object containing indirect stream objects_. Your code only considers the case of an immediate, indirect stream object. – mkl Jan 25 '21 at 11:17
  • Do you know how i can access the indirect stream objects in these arrays? Couldnt find that on the hummus.js doc. If i loop through the keys of the array it doesnt return any – KevGi_AWSi Jan 26 '21 at 08:55
  • Sorry, don't know much about hummus or Javascript, merely a bit about PDF. Thus, I could recognize that your code does not cover all possible cases, but I don't know how exactly to cover them with hummus and Javascript. – mkl Jan 26 '21 at 12:07
  • Thank you, you were already a big help, thanks to that I can atleast now differentiate between different contents, but now im stuck at another problem, maybe you know why it happens and how i could solve it: For some reason the new solution removes all text but keeps the images from pdf pages which i access through these arrays, the rest of the pdf pages still work perfectly fine – KevGi_AWSi Jan 28 '21 at 13:52
  • OK, I solved it: I had initialized data outside the loop, so it couldn't work :) Thank you again for your help! – KevGi_AWSi Jan 28 '21 at 13:58

0 Answers0