/** * Copyright 2018 Google Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * @author ebidel@ (Eric Bidelman) */ /** * Discovers all the pages in site or single page app (SPA) and creates * a tree of the result in ./output/ { const currentPath = parentPath + dirName; if (!fs.existsSync(currentPath)) { fs.mkdirSync(currentPath); } return currentPath + '/'; }, ''); } catch (err) { if (err.code !== 'EEXIST') { throw err; } } } /** * Finds all anchors on the page, inclusive of those within shadow roots. * Note: Intended to be run in the context of the page. * @param {boolean=} sameOrigin When true, only considers links from the same origin as the app. * @return {!Array} List of anchor hrefs. */ function collectAllSameOriginAnchorsDeep(sameOrigin = true) { const allElements = []; const findAllElements = function(nodes) { for (let i = 0, el; el = nodes[i]; ++i) { allElements.push(el); // If the element has a shadow root, dig deeper. if (el.shadowRoot) { findAllElements(el.shadowRoot.querySelectorAll('*')); } } }; findAllElements(document.querySelectorAll('*')); const filtered = allElements .filter(el => el.localName === 'a' && el.href) // element is an anchor with an href. .filter(el => el.href !== location.href) // link doesn't point to page's own URL. .filter(el => { if (sameOrigin) { return new URL(location).origin === new URL(el.href).origin; } return true; }) .map(a => a.href); return Array.from(new Set(filtered)); } /** * Crawls a URL by visiting an url, then recursively visiting any child subpages. * @param {!Browser} browser * @param {{url: string, title: string, img?: string, children: !Array}} page Current page. * @param {number=} depth Current subtree depth of crawl. */ async function crawl(browser, page, depth = 0) { if (depth > maxDepth) { return; } // If we've already crawled the URL, we know its children. if (crawledPages.has(page.url)) { console.log(`Reusing route: ${page.url}`); const item = crawledPages.get(page.url); page.title = item.title; page.img = item.img; page.children = item.children; // Fill in the children with details (if they already exist). page.children.forEach(c => { const item = crawledPages.get(c.url); c.title = item ? item.title : ''; c.img = item ? item.img : null; }); return; } else { console.log(`Loading: ${page.url}`); const newPage = await browser.newPage(); await newPage.goto(page.url, {waitUntil: 'networkidle2'}); let anchors = await newPage.evaluate(collectAllSameOriginAnchorsDeep); anchors = anchors.filter(a => a !== URL) // link doesn't point to start url of crawl. page.title = await newPage.evaluate('document.title'); page.children = anchors.map(url => ({url})); if (SCREENSHOTS) { const path = `./${OUT_DIR}/${slugify(page.url)}.png`; let imgBuff = await newPage.screenshot({fullPage: false}); imgBuff = await sharp(imgBuff).resize(null, 150).toBuffer(); // resize image to 150 x auto. util.promisify(fs.writeFile)(path, imgBuff); // async page.img = `data:img/png;base64,${imgBuff.toString('base64')}`; } crawledPages.set(page.url, page); // cache it. await newPage.close(); } // Crawl subpages. for (const childPage of page.children) { await crawl(browser, childPage, depth + 1); } } (async() => { mkdirSync(OUT_DIR); // create output dir if it doesn't exist. await del([`${OUT_DIR}/*`]); // cleanup after last run. const browser = await puppeteer.launch(); const page = await browser.newPage(); if (VIEWPORT) { await page.setViewport(VIEWPORT); } const root = {url: URL}; await crawl(browser, root); await util.promisify(fs.writeFile)(`./${OUT_DIR}/crawl.json`, JSON.stringify(root, null, ' ')); await browser.close(); })();