I am trying to create a function that, when passed a Google Slides presentation ID, can parse the presentation and write all URL links it finds to a Google Sheet. I have built adapted the following function to do the same for a Google Docs document input based off of this answer from @Yuval
function getAllLinks(docId, mergeAdjacent) {
var links = [];
//var doc = DocumentApp.getActiveDocument();
var doc = DocumentApp.openById(docId);
var parentDocName = doc.getName();
var ss=SpreadsheetApp.getActive();
var sh=ss.getSheetByName('Extracted Links');
iterateSections(doc, function(section, sectionIndex, isFirstPageSection) {
if (!("getParagraphs" in section)) {
// as we're using some undocumented API, adding this to avoid cryptic
// messages upon possible API changes.
throw new Error("An API change has caused this script to stop " +
"working.\n" +
"Section #" + sectionIndex + " of type " +
section.getType() + " has no .getParagraphs() method. " +
"Stopping script.");
}
section.getParagraphs().forEach(function(par) {
// skip empty paragraphs
if (par.getNumChildren() == 0) {
return;
}
// go over all text elements in paragraph / list-item
for (var el=par.getChild(0); el!=null; el=el.getNextSibling()) {
if (el.getType() != DocumentApp.ElementType.TEXT) {
continue;
}
// go over all styling segments in text element
var attributeIndices = el.getTextAttributeIndices();
var lastLink = null;
attributeIndices.forEach(function(startOffset, i, attributeIndices) {
var url = el.getLinkUrl(startOffset);
if (url != null) {
// we hit a link
var endOffsetInclusive = (i+1 < attributeIndices.length?
attributeIndices[i+1]-1 : null);
// check if this and the last found link are continuous
if (mergeAdjacent && lastLink != null && lastLink.url == url &&
lastLink.endOffsetInclusive == startOffset - 1) {
// this and the previous style segment are continuous
lastLink.endOffsetInclusive = endOffsetInclusive;
return;
}
lastLink = {
"section": section,
"isFirstPageSection": isFirstPageSection,
"paragraph": par,
"textEl": el,
"startOffset": startOffset,
"endOffsetInclusive": endOffsetInclusive,
"url": url
};
var row = sh.getLastRow() + 1;
var r1=sh.getRange(row, 1);
r1.setValue(parentDocName);
var r2=sh.getRange(row, 2);
r2.setValue(url);
Logger.log(parentDocName)
Logger.log(url)
links.push(lastLink);
}
});
}
});
});
return links;
}
/**
* Calls the given function for each section of the document (body, header,
* etc.). Sections are children of the DocumentElement object.
*
* @param {Document} doc The Document object (such as the one obtained via
* a call to DocumentApp.getActiveDocument()) with the sections to iterate
* over.
* @param {Function} func A callback function which will be called, for each
* section, with the following arguments (in order):
* - {ContainerElement} section - the section element
* - {Number} sectionIndex - the child index of the section, such that
* doc.getBody().getParent().getChild(sectionIndex) == section.
* - {Boolean} isFirstPageSection - whether the section is a first-page
* header/footer section.
*/
function iterateSections(doc, func) {
// get the DocumentElement interface to iterate over all sections
// this bit is undocumented API
var docEl = doc.getBody().getParent();
var regularHeaderSectionIndex = (doc.getHeader() == null? -1 :
docEl.getChildIndex(doc.getHeader()));
var regularFooterSectionIndex = (doc.getFooter() == null? -1 :
docEl.getChildIndex(doc.getFooter()));
for (var i=0; i<docEl.getNumChildren(); ++i) {
var section = docEl.getChild(i);
var sectionType = section.getType();
var uniqueSectionName;
var isFirstPageSection = (
i != regularHeaderSectionIndex &&
i != regularFooterSectionIndex &&
(sectionType == DocumentApp.ElementType.HEADER_SECTION ||
sectionType == DocumentApp.ElementType.FOOTER_SECTION));
func(section, i, isFirstPageSection);
}
}
When I try to create the same for a Google Slides presentation as the input, I am getting stuck on the step of how to parse through the document and extract all of the text bits (in order to check them for links). It seems like I would need to use getSlides()
, and then getPageElements()
and iterate through those, but I am unclear on how to get to the actual text on the slides. Any tips on how to get iterate through the actual text on slides (and potentially how to extract the link URL out of that text if it has one) would be much appreciated. Thank you!