Extract all URL links out of Google Slides presentation with Google Apps Script

Question

I am trying to create a function that, when passed a Google Slides presentation ID, can parse the presentation and write all URL links it finds to a Google Sheet. I have adapted the following function, which does the same for a Google Docs document input, based on this answer from @Yuval:

/**
 * Collects every hyperlink found in the text of a Google Docs document and
 * appends one (document name, URL) row per link to the 'Extracted Links'
 * sheet of the active spreadsheet.
 *
 * @param {string} docId ID of the document, passed to DocumentApp.openById().
 * @param {boolean} mergeAdjacent When truthy, consecutive styling segments of
 *     the same text element that carry the same URL are reported as one link.
 * @return {Object[]} One descriptor per link with the keys `section`,
 *     `isFirstPageSection`, `paragraph`, `textEl`, `startOffset`,
 *     `endOffsetInclusive` (null when the link runs to the end of the text
 *     element) and `url`.
 * @throws {Error} If a section has no getParagraphs() method, or if links
 *     were found but the 'Extracted Links' sheet does not exist.
 */
function getAllLinks(docId, mergeAdjacent) {
  const links = [];

  const doc = DocumentApp.openById(docId);
  const parentDocName = doc.getName();
  const sh = SpreadsheetApp.getActive().getSheetByName('Extracted Links');

  iterateSections(doc, function(section, sectionIndex, isFirstPageSection) {
    if (!("getParagraphs" in section)) {
      // as we're using some undocumented API, adding this to avoid cryptic
      // messages upon possible API changes.
      throw new Error("An API change has caused this script to stop " +
                      "working.\n" +
                      "Section #" + sectionIndex + " of type " +
                      section.getType() + " has no .getParagraphs() method. " +
        "Stopping script.");
    }

    section.getParagraphs().forEach(function(par) {
      // skip empty paragraphs
      if (par.getNumChildren() == 0) {
        return;
      }

      // go over all text elements in paragraph / list-item
      for (let el = par.getChild(0); el != null; el = el.getNextSibling()) {
        if (el.getType() != DocumentApp.ElementType.TEXT) {
          continue;
        }

        // go over all styling segments in text element; each index marks the
        // start of a run of uniformly styled text
        let lastLink = null;
        el.getTextAttributeIndices().forEach(function(startOffset, i, indices) {
          const url = el.getLinkUrl(startOffset);
          if (url == null) {
            return;
          }

          // the segment ends right before the next one starts; the last
          // segment has no known end, which is recorded as null
          const endOffsetInclusive =
              (i + 1 < indices.length ? indices[i + 1] - 1 : null);

          // extend the previous link instead of reporting a new one when
          // both segments are contiguous and point at the same URL
          if (mergeAdjacent && lastLink != null && lastLink.url == url &&
              lastLink.endOffsetInclusive == startOffset - 1) {
            lastLink.endOffsetInclusive = endOffsetInclusive;
            return;
          }

          lastLink = {
            "section": section,
            "isFirstPageSection": isFirstPageSection,
            "paragraph": par,
            "textEl": el,
            "startOffset": startOffset,
            "endOffsetInclusive": endOffsetInclusive,
            "url": url
          };
          Logger.log(parentDocName);
          Logger.log(url);
          links.push(lastLink);
        });
      }
    });
  });

  // Write all rows with a single setValues() call rather than two setValue()
  // calls per link; the sheet is only required when there is something to
  // write.
  if (links.length > 0) {
    if (sh == null) {
      throw new Error("Sheet 'Extracted Links' was not found in the active " +
                      "spreadsheet.");
    }
    const rows = links.map(function(link) {
      return [parentDocName, link.url];
    });
    sh.getRange(sh.getLastRow() + 1, 1, rows.length, 2).setValues(rows);
  }

  return links;
}

/**
 * Calls the given function for each section of the document (body, header,
 * etc.). Sections are children of the DocumentElement object.
 *
 * A header or footer section is reported as a first-page section when it is
 * not the one returned by doc.getHeader() / doc.getFooter(). When the
 * document has no regular header (or footer), every header (or footer)
 * section is therefore reported as a first-page section.
 *
 * @param {Document} doc The Document object (such as the one obtained via
 *     a call to DocumentApp.getActiveDocument()) with the sections to iterate
 *     over.
 * @param {Function} func A callback function which will be called, for each
 *     section, with the following arguments (in order):
 *       - {ContainerElement} section - the section element
 *       - {Number} sectionIndex - the child index of the section, such that
 *         doc.getBody().getParent().getChild(sectionIndex) == section.
 *       - {Boolean} isFirstPageSection - whether the section is a first-page
 *         header/footer section.
 */
function iterateSections(doc, func) {
  // get the DocumentElement interface to iterate over all sections
  // this bit is undocumented API
  const docEl = doc.getBody().getParent();

  // -1 never matches a child index, so it stands for "no such section"
  const header = doc.getHeader();
  const footer = doc.getFooter();
  const regularHeaderSectionIndex =
      (header == null ? -1 : docEl.getChildIndex(header));
  const regularFooterSectionIndex =
      (footer == null ? -1 : docEl.getChildIndex(footer));

  for (let i = 0; i < docEl.getNumChildren(); ++i) {
    const section = docEl.getChild(i);
    const sectionType = section.getType();

    const isHeaderOrFooter =
        (sectionType == DocumentApp.ElementType.HEADER_SECTION ||
         sectionType == DocumentApp.ElementType.FOOTER_SECTION);
    const isFirstPageSection = (
      i != regularHeaderSectionIndex &&
      i != regularFooterSectionIndex &&
      isHeaderOrFooter);

    func(section, i, isFirstPageSection);
  }
}

When I try to create the same for a Google Slides presentation as the input, I am getting stuck on the step of how to parse through the document and extract all of the text bits (in order to check them for links). It seems like I would need to use getSlides(), and then getPageElements() and iterate through those, but I am unclear on how to get to the actual text on the slides. Any tips on how to iterate through the actual text on slides (and potentially how to extract the link URL out of that text if it has one) would be much appreciated. Thank you!

NightEye · Accepted Answer · 2021-08-17T19:36:34.553

4

If you just want to get the links from the slides, see the code below:

Code:

/**
 * Collects the URLs of all hyperlinked text found in the shapes and table
 * cells of a Google Slides presentation, logging each one.
 *
 * Only real hyperlinks are reported: plain text that merely looks like a URL
 * is ignored, and links whose getUrl() is null (links that do not point to
 * an external web page) are skipped.
 *
 * @param {string=} presentationId ID of the presentation to scan. When
 *     omitted, the active presentation is used.
 * @return {string[]} The URLs in the order they were found: slide by slide,
 *     shapes first, then tables row by row.
 */
function getLinksFromSlides(presentationId) {
  const presentation = (presentationId == null
      ? SlidesApp.getActivePresentation()
      : SlidesApp.openById(presentationId));
  const urls = [];

  // Logs and records the URL of every hyperlink inside one text range.
  const collectLinks = (textRange) => {
    textRange.getLinks().forEach((link) => {
      const url = link.getTextStyle().getLink().getUrl();
      if (url != null) {
        Logger.log(url);
        urls.push(url);
      }
    });
  };

  // traverse each slide
  presentation.getSlides().forEach((slide) => {
    // text placed directly in shapes
    slide.getShapes().forEach((shape) => collectLinks(shape.getText()));

    // text inside table cells, which getShapes() does not cover
    slide.getTables().forEach((table) => {
      const numRows = table.getNumRows();
      const numColumns = table.getNumColumns();
      for (let row = 0; row < numRows; ++row) {
        for (let col = 0; col < numColumns; ++col) {
          const cell = table.getCell(row, col);
          // Non-head cells of a merged region are skipped; the region's
          // text is read from its head cell.
          if (cell.getMergeState() === SlidesApp.CellMergeState.MERGED) {
            continue;
          }
          collectLinks(cell.getText());
        }
      }
    });
  });

  return urls;
}

Sample:

Output:

Note:

This only extracts the hyperlinks. It doesn't extract any links/url that isn't hyperlinked as shown in the sample data. (e.g. https://www.facebook.com)
If you want the non-hyperlink urls, then you might have to try regex.

edited Aug 17 '21 at 19:36

answered Aug 17 '21 at 19:17

NightEye

10,634
2
5
24

1

Thank you! I am also trying to do the same for a Google Sheets spreadsheet as the input. Do you know if `SpreadsheetApp` also has a similar `link` object? And if so do you know which method I would need to apply to a range to get out the links if I iterated through the spreadsheet with `getSheets()` and `getRange()`? – Gabriel Tero Aug 17 '21 at 20:17
This is a very different question from the post above, but I found a similar post that you can try. https://stackoverflow.com/a/62135134/14606045. If you have question regarding the link, please do post it in a different question. Noting here that `Range` class has [getRichTextValue](https://developers.google.com/apps-script/reference/spreadsheet/range#getrichtextvalue) and it does return `RichTextValue`. That does have link related methods like what I used above. That should provide you with a better understanding. For the iteration, check if you can use `getRichTextValue` on `getDataRange` – NightEye Aug 17 '21 at 20:27
By the way, this won't get tables I don't think. – IMTheNachoMan Jul 12 '23 at 02:03

Extract all URL links out of Google Slides presentation with Google Apps Script

1 Answer

Code:

Sample:

Output:

Note: