import puppeteer, { product } from "puppeteer";
import jsonfile from "jsonfile";
import cheerio from "cheerio";
import fs from "fs-extra";
import { getDomain } from "tldts";

// Import website modules
import {
  FitnessKoerier,
  FitnessApparaat,
  BolCom,
  BeterSport,
  FitnessGeest,
  BodyAndFit,
  PerfectBody,
  Alternate,
  Expert,
  Silvergear,
} from "./modules/websites";

/** A product found on a blog page, together with the result of its stock check. */
interface Product {
  name: string;
  /** Blog page on which the product was found. */
  blogUrl: string;
  /** Shop URL; replaced by the final URL after redirects once the stock check ran. */
  url: string;
  /** Registrable domain of the shop as reported by `getDomain`, "" until checked. */
  domain: string | null;
  inStock: boolean;
}

/** Result of a stock check: `[domain, final url, inStock]`. */
type StockStatus = [string | null, string, boolean];

/** Minimal shape this script relies on for a website module. */
interface StockChecker {
  check(html: string): boolean | Promise<boolean>;
}

/** Cheerio document handle, derived from `cheerio.load` so it works across cheerio versions. */
type CheerioRoot = ReturnType<typeof cheerio.load>;

/**
 * Maps a shop domain to the website module that checks its stock.
 * gorillasports.nl and bcc.nl reuse the BodyAndFit and Expert modules,
 * exactly as the original switch statement did.
 * NOTE(review): presumably those shops share page markup - confirm.
 */
const STOCK_CHECKERS = new Map<string, StockChecker>([
  ["bol.com", BolCom],
  ["fitnesskoerier.nl", FitnessKoerier],
  ["fitnessapparaat.nl", FitnessApparaat],
  ["betersport.nl", BeterSport],
  ["fitness-geest.nl", FitnessGeest],
  ["bodyandfit.com", BodyAndFit],
  ["gorillasports.nl", BodyAndFit],
  ["perfectbody.nl", PerfectBody],
  ["alternate.nl", Alternate],
  ["expert.nl", Expert],
  ["bcc.nl", Expert],
  ["silvergear.eu", Silvergear],
]);

// Globals
let browser: puppeteer.Browser | null = null;
let allProducts: Array<Product> = [];
let allBlogUrls: Array<string> = [];
let urlsFileName = "";

/**
 * Entry point: reads the JSON file named by the first CLI argument (expected
 * to contain an array of blog urls), crawls every blog page and writes the
 * stock report.
 *
 * Any failure is logged and swallowed here; the browser is always closed, so
 * a failed run does not leave a Chromium process behind.
 */
async function start(): Promise<void> {
  try {
    console.log(`---- ContentEggStockCrawler ----`);

    // Validate the argument before launching the browser so that a usage
    // error cannot leave a browser process running.
    const params = process.argv.slice(2);
    const urlsFile = params[0];
    if (!urlsFile) {
      throw new Error("Missing argument: path to the JSON file containing the blog urls");
    }
    // Strip the extension; the report file name is derived from this.
    urlsFileName = urlsFile.replace(/\.[^/.]+$/, "");

    console.log(`Starting crawler browser..`);
    const launched = await puppeteer.launch({ headless: true });
    browser = launched;

    try {
      const urls: unknown = await jsonfile.readFile(`./${urlsFile}`);
      if (!Array.isArray(urls)) {
        throw new Error(`Expected ${urlsFile} to contain a JSON array of urls`);
      }

      console.log(`Loaded ${urls.length} urls from file! (${urlsFile})`);
      console.log(`Initialization done!`);
      console.log(`------------------------------------- \n`);

      // Pages are crawled one at a time; crawlBlogPage handles its own errors.
      for (const url of urls) {
        allBlogUrls.push(url);
        await crawlBlogPage(url);
      }
    } finally {
      browser = null;
      await launched.close();
    }

    await generateReport();
  } catch (error) {
    console.error(`An Error Occured!`, error);
  }
}

/**
 * Opens `url` in a new browser page, waits for the network to become idle and
 * returns the rendered body html together with the final url (after redirects).
 * The page is closed even when navigation or evaluation fails.
 *
 * @throws Error when the browser has not been started, or whatever puppeteer throws.
 */
async function loadRenderedPage(url: string): Promise<{ html: string; finalUrl: string }> {
  if (browser === null) {
    throw new Error("Browser has not been started");
  }

  const page = await browser.newPage();
  try {
    await page.goto(url, { waitUntil: "networkidle2" });
    const html = await page.evaluate(() => document.body.innerHTML);
    // Read the url while the page is still open.
    return { html, finalUrl: page.url() };
  } finally {
    await page.close();
  }
}

/**
 * Collects the Content Egg products present in a loaded blog page.
 * Two layouts are recognised: "row" products (`.row-products`) and "large"
 * products (`div.egg-container.egg-item`). Entries without a link are skipped
 * because there is nothing to crawl for them.
 */
function extractProducts($: CheerioRoot, blogUrl: string): Product[] {
  const found: Product[] = [];

  const addProduct = (rawName: string | null, productUrl: string | undefined): void => {
    const name = (rawName || "").trim();
    if (!productUrl) {
      console.log(`Skipping product without a link: ${name}`);
      return;
    }
    found.push({ name, blogUrl, url: productUrl, domain: "", inStock: false });
  };

  const rowCount = $(".row-products").length;
  if (rowCount >= 1) {
    console.log(`Detected ${rowCount} content egg row type products!`);
    $(".row-products .cegg-list-logo-title a").each((_index, element) => {
      addProduct($(element).html(), $(element).attr("href"));
    });
  }

  const largeItems = $("div.egg-container.egg-item");
  if (largeItems.length >= 1) {
    console.log(`Detected ${largeItems.length} content egg large type products!`);
    largeItems.each((_index, element) => {
      const item = $(element);
      addProduct(
        item.find("h2").first().html(),
        item.find(".cegg-btn-row a").first().attr("href"),
      );
    });
  }

  return found;
}

/**
 * Crawls one blog page: finds its Content Egg products, checks the stock of
 * each one and appends the checked products to `allProducts`.
 * Errors are logged and the page is skipped; this function does not throw.
 *
 * @param url Blog page to crawl.
 */
async function crawlBlogPage(url: string): Promise<void> {
  try {
    console.log(`Crawling blog page: ${url}`);

    const { html } = await loadRenderedPage(url);
    const products = extractProducts(cheerio.load(html), url);

    console.log("Checking product stocks...");
    for (const item of products) {
      const status = await crawlProductStock(item.url);
      if (status === undefined) {
        console.log("Skipping product...");
        continue;
      }

      [item.domain, item.url, item.inStock] = status;
      const marker = item.inStock ? "✅" : "❌";
      console.log(`${marker} ${item.name} - ${item.domain}`);
      allProducts.push(item);
    }

    console.log("");
    console.log("");
  } catch (error) {
    console.error(error);
    console.log("Error while trying to crawl page! Skipping...");
  }
}

/**
 * Opens a product page and asks the website module registered for its domain
 * whether the product is in stock.
 *
 * @param url Product (affiliate) url; redirects are followed by the browser.
 * @returns `[domain, final url, inStock]`; `inStock` is `false` when no module
 *          exists for the domain. Returns `undefined` when the page could not
 *          be crawled (the error is logged).
 */
async function crawlProductStock(url: string): Promise<StockStatus | undefined> {
  try {
    const { html, finalUrl } = await loadRenderedPage(url);

    // Get domain from the url the browser ended up on
    const domain = getDomain(finalUrl);
    const checker = domain === null ? undefined : STOCK_CHECKERS.get(domain);

    if (checker === undefined) {
      console.error(`-- ${domain} is not an available website module! Can't check stock!`);
      return [domain, finalUrl, false];
    }

    return [domain, finalUrl, await checker.check(html)];
  } catch (error) {
    console.error(error);
    console.log("Error while trying to crawl page! Skipping...");
    return undefined;
  }
}

/**
 * Appends the stock report to `<urlsFileName>-report.txt`: a summary header
 * followed by one section per blog url that lists its out-of-stock products.
 * The text is appended, so an existing report file is extended rather than
 * replaced.
 */
async function generateReport(): Promise<void> {
  console.log("Generating report...");

  const file = `${urlsFileName}-report.txt`;
  const totalProducts = allProducts.length;
  const totalOutStock = allProducts.filter((item) => item.inStock === false).length;

  const parts: string[] = [
    `----------------------------------------\n`,
    `Content Egg Product Stock Crawler Report\n`,
    `Total Blog Urls: ${allBlogUrls.length}\n`,
    `Total Products: ${totalProducts}\n`,
    `Total Out of Stock: ${totalOutStock}\n`,
    `----------------------------------------\n`,
  ];

  let lastBlogUrl = "";
  for (const item of allProducts) {
    // Products are stored in crawl order, so a change of blog url starts a new section.
    if (item.blogUrl !== lastBlogUrl) {
      parts.push(`\n\n`, `${item.blogUrl}\n`, `----------------------------------\n`);
    }

    if (item.inStock === false) {
      parts.push(`${item.domain} - ${item.name} - ${item.url}\n`);
    }

    lastBlogUrl = item.blogUrl;
  }

  // Single write instead of one append per line.
  await fs.appendFile(file, parts.join(""));

  console.log("Report generated!");
}

// Started last so every module-level binding above is initialised before use.
start();