import puppeteer, { product } from "puppeteer";
import jsonfile from "jsonfile";
import cheerio from "cheerio";
import fs from "fs-extra";
import { getDomain } from "tldts";
import got from "got";

// Import website modules
import {
  FitnessKoerier,
  FitnessApparaat,
  BolCom,
  BeterSport,
  FitnessGeest,
  BodyAndFit,
  PerfectBody,
  Alternate,
  Expert,
  Silvergear,
  AlsaNature,
  Dobey,
  EzyDog,
  K9Shop,
  OnlineHondenSpeciaalZaak,
  MacroVet,
  PetDuka,
  ZooEnzo,
  PetsOnline,
  PetsPlace,
  ThePetEmpire,
  Conrad,
  HuisdierExpress,
  HondenBed,
  Coolblue,
  WifiMedia,
  Ep
} from "./modules/websites";

/** A product link found on a blog page, together with its stock-check result. */
interface CrawledProduct {
  /** Inner HTML of the product title element, trimmed. */
  name: string;
  /** Blog page the product link was found on. */
  blogUrl: string;
  /** Product link; replaced by the page's final url once the stock check ran. */
  url: string;
  /** Registrable domain of the final product url ("" until checked). */
  domain: string | null;
  /** Result of the website module's stock check (false until checked). */
  inStock: boolean;
}

/** Result of a stock check: `[domain, final url, in stock]`. */
type StockStatus = [string | null, string, boolean];

/** Body HTML of a rendered page plus the url the browser ended up on. */
interface RenderedPage {
  html: string;
  finalUrl: string;
}

// Globals
let browser: puppeteer.Browser | null = null;
let sitemapUrl: string = "";
const allProducts: CrawledProduct[] = [];
const allBlogUrls: string[] = [];

// debugStart();
start();

/**
 * Debug entry point: launches a visible browser and prints the stock status
 * of one hard-coded product url. The browser is intentionally left open.
 */
async function debugStart(): Promise<void> {
  browser = await puppeteer.launch({ headless: false });
  console.log(await crawlProductStock("https://www.ep.nl/products/sony-kd-l32we610-hd-led-tv/27658/"));
}

/**
 * Main entry point. Reads the sitemap url from the first CLI argument,
 * collects every unique `<loc>` url from the sitemap, crawls each one as a
 * blog page and finally writes the report file.
 *
 * Never rejects: every failure is logged with `An Error Occured!`.
 */
async function start(): Promise<void> {
  try {
    console.log(`---- ContentEggStockCrawler ----`);

    // Get sitemap url from process params
    const params = process.argv.slice(2);
    sitemapUrl = params[0];
    if (!sitemapUrl) {
      // Fail with a clear message instead of an opaque request error.
      throw new Error("No sitemap url given! Pass the sitemap url as the first argument.");
    }

    // Crawl Sitemap
    const sitemap = await got(sitemapUrl);
    const $ = cheerio.load(sitemap.body, { xmlMode: true });

    // A Set de-duplicates while keeping the order of first appearance.
    const uniqueUrls = new Set<string>();
    $("loc").each((_index, element) => {
      uniqueUrls.add($(element).text());
    });
    const urls = Array.from(uniqueUrls);

    console.log(`Loaded ${urls.length} urls from sitemap! (${params[0]})`);
    console.log(`Starting crawler browser..`);
    const launchedBrowser = await puppeteer.launch({ headless: true });
    browser = launchedBrowser;
    console.log(`Initialization done!`);
    console.log(`------------------------------------- \n`);

    try {
      for (const url of urls) {
        try {
          allBlogUrls.push(url);
          await crawlBlogPage(url);
        } catch (error) {
          // crawlBlogPage handles its own errors; this is a last line of defence.
          console.error(`Unexpected error while crawling ${url}`, error);
        }
      }
    } finally {
      // Always shut the browser down so the process can exit.
      await launchedBrowser.close();
    }

    await generateReport();
  } catch (error) {
    console.error(`An Error Occured!`, error);
  }
}

/**
 * Opens `url` in a new browser page, waits for the network to settle and
 * returns the body HTML together with the url the page ended up on.
 * The page is closed even when navigation or evaluation fails.
 *
 * @throws Error when the global browser has not been launched, or whatever
 *   puppeteer throws while navigating.
 */
async function fetchRenderedPage(url: string): Promise<RenderedPage> {
  if (!browser) {
    throw new Error("Browser has not been launched!");
  }

  const page = await browser.newPage();
  try {
    await page.goto(url, { waitUntil: "networkidle2" });
    const html = await page.evaluate(() => document.body.innerHTML);
    // Read the url before closing the page.
    return { html, finalUrl: page.url() };
  } finally {
    await page.close();
  }
}

/**
 * Crawls one blog page, collects the Content Egg product links on it
 * ("row" and "large" layouts), checks the stock of each product and appends
 * the checked products to `allProducts`.
 *
 * Never rejects: a failing page is logged and skipped.
 */
async function crawlBlogPage(url: string): Promise<void> {
  try {
    console.log(`Crawling blog page: ${url}`);

    const { html } = await fetchRenderedPage(url);

    // Load html in cheerio object
    const $ = cheerio.load(html);
    const products: CrawledProduct[] = [];

    const addProduct = (name: string | null, productUrl: string | undefined): void => {
      products.push({
        // A missing title must not abort the whole page.
        name: (name || "").trim(),
        blogUrl: url,
        url: productUrl || "",
        domain: "",
        inStock: false,
      });
    };

    const rowCount = $(".row-products").length;
    if (rowCount >= 1) {
      console.log(`Detected ${rowCount} content egg row type products!`);
      $(".row-products .cegg-list-logo-title a").each((_index, element) => {
        addProduct($(element).html(), $(element).attr("href"));
      });
    }

    const largeItems = $("div.egg-container.egg-item");
    if (largeItems.length >= 1) {
      // Report the count of the selector that is actually iterated below.
      console.log(`Detected ${largeItems.length} content egg large type products!`);
      largeItems.each((_index, element) => {
        addProduct(
          $(element).find("h2").first().html(),
          $(element).find(".cegg-btn-row a").first().attr("href")
        );
      });
    }

    console.log("Checking product stocks...");
    for (const product of products) {
      // Products without a link cannot be checked.
      const status = product.url ? await crawlProductStock(product.url) : undefined;
      if (!status) {
        console.log("Skipping product...");
        continue;
      }

      [product.domain, product.url, product.inStock] = status;

      if (product.inStock) {
        console.log(`[IN STOCK] ${product.name} - ${product.domain}`);
      } else {
        console.log(`[OUT OF STOCK] ${product.name} - ${product.domain}`);
      }

      allProducts.push(product);
    }

    console.log("");
    console.log("");
  } catch (error) {
    console.error(error);
    console.log("Error while trying to crawl page! Skipping...");
  }
}

/**
 * Runs the stock check of the website module that belongs to `domain`.
 * Unknown domains are logged and reported as not in stock.
 */
async function checkStockForDomain(domain: string | null, html: string) {
  switch (domain) {
    case "bol.com":
      return BolCom.check(html);
    case "coolblue.nl":
      return Coolblue.check(html);
    case "fitnesskoerier.nl":
      return FitnessKoerier.check(html);
    case "fitnessapparaat.nl":
      return FitnessApparaat.check(html);
    case "betersport.nl":
      return BeterSport.check(html);
    case "fitness-geest.nl":
      return FitnessGeest.check(html);
    case "bodyandfit.com":
    case "gorillasports.nl":
      return BodyAndFit.check(html);
    case "perfectbody.nl":
      return PerfectBody.check(html);
    case "alternate.nl":
      return Alternate.check(html);
    case "expert.nl":
    case "bcc.nl":
      return Expert.check(html);
    case "silvergear.eu":
      return Silvergear.check(html);
    case "conrad.com":
    case "conrad.nl":
      return Conrad.check(html);
    case "alsa-nature.nl":
      return AlsaNature.check(html);
    case "dobey.nl":
      return Dobey.check(html);
    case "ezydog.nl":
      return EzyDog.check(html);
    case "k9shop.nl":
      return K9Shop.check(html);
    case "onlinehondenspeciaalzaak.nl":
      return OnlineHondenSpeciaalZaak.check(html);
    case "macrovet.nl":
      return MacroVet.check(html);
    case "petduka.nl":
      return PetDuka.check(html);
    case "petsonline.nl":
      return PetsOnline.check(html);
    case "petsplace.nl":
      return PetsPlace.check(html);
    case "zoo-enzo.nl":
      return ZooEnzo.check(html);
    case "thepetempire.com":
      return ThePetEmpire.check(html);
    case "huisdierexpress.nl":
      return HuisdierExpress.check(html);
    case "hondenbed.nl":
      return HondenBed.check(html);
    case "wifimedia.eu":
      return WifiMedia.check(html);
    case "ep.nl":
      return Ep.check(html);
    default:
      console.error(`-- ${domain} is not an available website module! Can't check stock!`);
      return false;
  }
}

/**
 * Opens a product url and asks the matching website module whether the
 * product is in stock.
 *
 * @returns `[domain, final url, in stock]`, or `undefined` when the page
 *   could not be crawled (the error is logged).
 */
async function crawlProductStock(url: string): Promise<StockStatus | undefined> {
  try {
    const { html, finalUrl } = await fetchRenderedPage(url);

    // Use the url after redirects, so affiliate links resolve to the shop.
    const domain = getDomain(finalUrl);

    const inStock = Boolean(await checkStockForDomain(domain, html));
    return [domain, finalUrl, inStock];
  } catch (error) {
    console.error(error);
    console.log("Error while trying to crawl page! Skipping...");
    return undefined;
  }
}

/**
 * Appends a report to `<sitemap domain>_<year>-<month>-<day>_report.txt`:
 * a totals header, then one section per blog url listing its out-of-stock
 * products. Month and day are not zero-padded.
 */
async function generateReport(): Promise<void> {
  console.log("Generating report...");

  const today = new Date();
  // getDate() is the day of the month; getDay() would be the day of the week.
  const datecode = `${today.getFullYear()}-${today.getMonth() + 1}-${today.getDate()}`;
  const domain = getDomain(sitemapUrl);
  const file = `${domain}_${datecode}_report.txt`;

  const totalProducts = allProducts.length;
  const totalOutStock = allProducts.filter((product) => !product.inStock).length;

  const lines: string[] = [
    `----------------------------------------\n`,
    `Content Egg Product Stock Crawler Report\n`,
    `Total Blog Urls: ${allBlogUrls.length}\n`,
    `Total Products: ${totalProducts}\n`,
    `Total Out of Stock: ${totalOutStock}\n`,
    `----------------------------------------\n`,
  ];

  let lastBlogUrl = "";
  for (const product of allProducts) {
    // Start a new section whenever the blog url changes.
    if (product.blogUrl !== lastBlogUrl) {
      lines.push(`\n\n`, `${product.blogUrl}\n`, `----------------------------------\n`);
    }

    if (!product.inStock) {
      lines.push(`${product.domain} - ${product.name} - ${product.url}\n`);
    }

    lastBlogUrl = product.blogUrl;
  }

  // One append instead of one file-system call per line.
  await fs.appendFile(file, lines.join(""));

  console.log("Report generated!");
}