import jsonfile from "jsonfile";
import cheerio, { CheerioAPI } from "cheerio";
import fs from "fs";
import { getDomain } from "tldts";
import { Browser, launch } from "puppeteer";
import axios from "axios";
import { CoolBlue, Dectdirect, Dobey, EP, LDJsonParser, Macrovet, Mediamarkt } from "./modules/websites";
import { createObjectCsvWriter } from "csv-writer";
import { CsvWriter } from "csv-writer/src/lib/csv-writer";
import { ObjectMap } from "csv-writer/src/lib/lang/object";

/** One row of the generated CSV report. */
interface IProduct {
  /** Registrable domain of the webshop the product link resolved to. */
  domain: string;
  /** Product name as scraped from the blog page. */
  product_name: string;
  /** Product link; replaced by the final (post-navigation) URL once checked. */
  product_url: string;
  /** Result of the shop-specific stock check. */
  in_stock: boolean;
  /** Blog page on which the product was found. */
  blog_url: string;
}

/** Decides from a rendered product page's HTML whether the product is in stock. */
type StockChecker = (html: string) => boolean | Promise<boolean>;

// Globals
let browser: Browser = null;
let sitemapUrl: string = "";
// NOTE(review): the type argument was missing in the reviewed source; this is
// presumably what createObjectCsvWriter returns - confirm against csv-writer's typings.
let writer: CsvWriter<ObjectMap<any>> = null;

/** User agent sent when loading webshop product pages. */
const PRODUCT_PAGE_USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36";

/** Pause after each successfully recorded product check, in milliseconds. */
const PRODUCT_CHECK_DELAY_MS = 1000;

// The checkers are wrapped in arrow functions so each `check` is always invoked
// as a member of its module object, exactly like the direct calls it replaces.
const checkLdJson: StockChecker = (html) => LDJsonParser.check(html);

/** Supported webshops, keyed by registrable domain. */
const STOCK_CHECKERS = new Map<string, StockChecker>([
  ["bol.com", checkLdJson],
  ["petsplace.nl", checkLdJson],
  ["dobey.nl", (html) => Dobey.check(html)],
  ["brekz.nl", checkLdJson],
  ["hondenbed.nl", checkLdJson],
  ["petsonline.nl", checkLdJson],
  ["coolblue.nl", (html) => CoolBlue.check(html)],
  ["bcc.nl", checkLdJson],
  ["azerty.nl", checkLdJson],
  ["cameranu.nl", checkLdJson],
  ["ep.nl", (html) => EP.check(html)],
  ["alternate.nl", checkLdJson],
  ["macrovet.nl", (html) => Macrovet.check(html)],
  ["ezydog.nl", checkLdJson],
  ["mediamarkt.nl", (html) => Mediamarkt.check(html)],
  ["dectdirect.nl", (html) => Dectdirect.check(html)],
  ["expert.nl", checkLdJson],
  ["maxiaxi.com", checkLdJson],
]);

// debugStart();
void start();

/**
 * Manual debugging entry point: launches a visible browser and prints the
 * stock result for a single hard-coded product URL.
 */
async function debugStart(): Promise<void> {
  browser = await launch({ headless: false });

  console.log(await crawlProductStock("https://www.dectdirect.nl/nl/unifi-switch-lite-8-poe-115502613.html"));
  // console.log(await crawlProductStock("https://www.mediamarkt.nl/nl/product/_apple-iphone-14-128gb-midnight-1738479.html"));
  // console.log(await crawlProductStock("https://www.coolblue.nl/product/923036/hp-deskjet-2720e-all-in-one.html?clickref=1101lwW9ebAE&utm_source=performancehorizon&utm_medium=affiliate&utm_campaign=Comparison%2FReview&utm_content=1101lwW9ebAE&utm_term=1100leWsm&ref=293530&PHGref=1101lwW9ebAE&cmt=c_ph%2Capm_Comparison%2FReview_%2Cacid1101l93%2Cacr_1100leWsm%2Caclr_1101lwW9ebAE"));
  // console.log(await crawlProductStock("https://www.coolblue.nl/product/882996/canon-pixma-ts-3450-zwart.html?clickref=1011lwWDBFEL&utm_source=performancehorizon&utm_medium=affiliate&utm_campaign=Comparison%2FReview&utm_content=1011lwWDBFEL&utm_term=1100leWsm&ref=293530&PHGref=1011lwWDBFEL&cmt=c_ph%2Capm_Comparison%2FReview_%2Cacid1101l93%2Cacr_1100leWsm%2Caclr_1011lwWDBFEL"));
}

/**
 * Main entry point.
 *
 * Reads the sitemap URL from the first CLI argument, collects page URLs from
 * that sitemap and from the `.xml` sitemaps it references (one level deep),
 * crawls every page for Content Egg products and appends one CSV row per
 * checked product to `<domain>_<date>_report.csv`.
 *
 * Exits the process with code 1 when the sitemap argument is missing or the
 * main sitemap cannot be fetched. Any other error is logged, not rethrown.
 */
async function start(): Promise<void> {
  try {
    console.log(`---- ContentEggStockCrawler ----`);

    // Get sitemap url from process params
    sitemapUrl = process.argv.slice(2)[0];
    if (!sitemapUrl) {
      console.log("[ERROR] Missing sitemap url argument.");
      process.exit(1);
    }

    // Sets keep insertion order and de-duplicate without rescanning an array.
    const urls = new Set<string>();
    const nestedSitemapUrls = new Set<string>();

    let mainLocations: string[] = [];
    try {
      // Crawl Sitemap
      console.log(`Crawling input sitemap: ${sitemapUrl}`);
      mainLocations = await fetchSitemapLocations(sitemapUrl);
    } catch (error) {
      console.log("[ERROR] Error occurred crawling main sitemap.");
      console.error(error);
      // Non-zero exit code so calling scripts can detect the failure.
      process.exit(1);
    }

    for (const location of mainLocations) {
      if (!location.endsWith(".xml")) {
        urls.add(location);
      } else if (!nestedSitemapUrls.has(location)) {
        console.log("Found sitemap url: " + location);
        nestedSitemapUrls.add(location);
      }
    }

    for (const nestedSitemapUrl of nestedSitemapUrls) {
      console.log(`Crawling sitemap: ${nestedSitemapUrl}`);
      try {
        // Nested sitemaps are not followed any further: every <loc> is a page url.
        for (const location of await fetchSitemapLocations(nestedSitemapUrl)) {
          urls.add(location);
        }
      } catch (error) {
        // One unreachable nested sitemap must not abort the whole run.
        console.log(`[ERROR] Error occurred crawling sitemap: ${nestedSitemapUrl}`, error);
      }
    }

    console.log(`Found ${urls.size} urls from sitemaps.`);

    const today = new Date();
    const datecode = `${today.getFullYear()}-${today.getMonth() + 1}-${today.getDate()}`;
    const domain = getDomain(sitemapUrl);
    const file = `${domain}_${datecode}_report.csv`;

    browser = await launch({ headless: true });
    try {
      console.log(`Initialized puppeteer browser.`);

      writer = createObjectCsvWriter({
        path: file,
        header: [
          { id: "domain", title: "WEBSHOP_DOMAIN" },
          { id: "product_name", title: "PRODUCT_NAME" },
          { id: "product_url", title: "PRODUCT_URL" },
          { id: "in_stock", title: "IN_STOCK" },
          { id: "blog_url", title: "BLOG_URL" },
        ],
        encoding: "utf8",
        recordDelimiter: "\r\n",
        fieldDelimiter: ",",
        alwaysQuote: true,
      });

      console.log(`Initialized csv writer.`);
      console.log(`Initialization done.`);
      console.log(`------------------------------------- \n`);

      // crawlUrl handles its own errors, so no extra try/catch is needed here.
      for (const url of urls) {
        await crawlUrl(url);
      }
    } finally {
      // Always shut the browser down, otherwise it outlives a failed run.
      await browser.close();
    }

    console.log(`Finished crawling all urls. Saved report to ${file}`);
  } catch (error) {
    console.error(`A error occurred!`, error);
  }
}

/**
 * Downloads a sitemap and returns the trimmed, non-empty text of every
 * `<loc>` element in document order.
 *
 * @throws whatever `axios.get` throws when the request fails.
 */
async function fetchSitemapLocations(url: string): Promise<string[]> {
  const response = await axios.get(url);
  const $ = cheerio.load(response.data, { xmlMode: true });

  const locations: string[] = [];
  $("loc").each((_, element) => {
    // Trim so surrounding whitespace inside <loc> does not end up in the url.
    const location = $(element).text().trim();
    if (location.length > 0) {
      locations.push(location);
    }
  });
  return locations;
}

/**
 * Opens `url` in a new browser tab, waits for `networkidle2` and returns the
 * rendered HTML together with the URL the tab ended up on.
 *
 * The tab is closed even when navigation or evaluation fails.
 *
 * @param userAgent optional user agent to set before navigating.
 */
async function loadRenderedPage(url: string, userAgent?: string): Promise<{ html: string; finalUrl: string }> {
  const page = await browser.newPage();
  try {
    if (userAgent !== undefined) {
      // Awaited so the user agent is applied before the navigation starts.
      await page.setUserAgent(userAgent);
    }
    await page.goto(url, { waitUntil: "networkidle2" });

    const html = await page.evaluate(() => document.documentElement.outerHTML);
    // Read the url while the tab is still open.
    return { html, finalUrl: page.url() };
  } finally {
    await page.close();
  }
}

/**
 * Extracts the Content Egg products (row type and large type) from a parsed
 * blog page. Entries without a link are skipped because they cannot be checked.
 */
function collectProducts($: CheerioAPI, blogUrl: string): IProduct[] {
  const products: IProduct[] = [];

  const addProduct = (name: string, productUrl: string | undefined): void => {
    if (!productUrl) {
      console.log(`  [ERROR] ${name} - no product url found`);
      return;
    }
    products.push({
      domain: "",
      product_name: name,
      product_url: productUrl,
      blog_url: blogUrl,
      in_stock: false,
    });
  };

  const rowContainers = $(".row-products");
  if (rowContainers.length >= 1) {
    console.log(`- Parsed ${rowContainers.length} content egg row type products.`);
    $(".row-products .cegg-list-logo-title a").each((_, element) => {
      const link = $(element);
      // html() returns null when nothing matched; fall back to an empty name.
      addProduct((link.html() ?? "").trim(), link.attr("href"));
    });
  }

  const largeItems = $("div.egg-container.egg-item");
  if (largeItems.length >= 1) {
    console.log(`- Parsed ${largeItems.length} content egg large type products.`);
    largeItems.each((_, element) => {
      const item = $(element);
      const name = (item.find("h2").first().html() ?? "").trim();
      addProduct(name, item.find(".cegg-btn-row a").first().attr("href"));
    });
  }

  return products;
}

/**
 * Crawls one blog page: renders it, extracts its Content Egg products, checks
 * each product's stock and appends a CSV row per successfully checked product.
 *
 * Never rejects; failures are logged and the page or product is skipped.
 */
async function crawlUrl(url: string): Promise<void> {
  try {
    console.log(`Crawling url: ${url}`);

    const { html } = await loadRenderedPage(url);
    const products = collectProducts(cheerio.load(html), url);

    console.log("- Checking product stocks...");
    for (const product of products) {
      try {
        const status = await crawlProductStock(product.product_url);
        if (status === undefined) {
          // The underlying error was already printed by crawlProductStock.
          console.log(`  [ERROR] ${product.product_name} - ${product.domain} - ${product.product_url}`);
          continue;
        }

        [product.domain, product.product_url, product.in_stock] = status;

        if (product.in_stock) {
          console.log(`  [IN STOCK] ${product.product_name} - ${product.domain}`);
        } else {
          console.log(`  [OUT OF STOCK] ${product.product_name} - ${product.domain}`);
        }

        // Write to csv
        await writer.writeRecords([
          {
            domain: product.domain,
            product_name: product.product_name,
            product_url: product.product_url,
            in_stock: product.in_stock,
            blog_url: product.blog_url,
          },
        ]);

        await wait(PRODUCT_CHECK_DELAY_MS);
      } catch (error) {
        console.log(`  [ERROR] ${product.product_name} - ${product.domain} - ${product.product_url}`, error);
      }
    }

    console.log("");
    console.log("");
  } catch (error) {
    console.error(error);
    console.log("-- Error while trying to crawl page! Skipping...");
  }
}

/**
 * Loads a product page and runs the stock checker registered for the domain
 * the page resolved to.
 *
 * @returns `[domain, finalUrl, inStock]`. `inStock` is `false` for domains
 *          without a registered checker. Returns `undefined` when the page
 *          could not be loaded or the checker threw; the error is logged.
 */
async function crawlProductStock(url: string): Promise<[string, string, boolean] | undefined> {
  try {
    const { html, finalUrl } = await loadRenderedPage(url, PRODUCT_PAGE_USER_AGENT);

    // Get domain from the url the page ended up on, not from the requested url.
    const domain = getDomain(finalUrl);

    const checker = STOCK_CHECKERS.get(domain);
    if (checker === undefined) {
      console.error(`-- ${domain} is not an supported website! Cannot check stock!`);
      return [domain, finalUrl, false];
    }

    return [domain, finalUrl, await checker(html)];
  } catch (error) {
    console.error(error);
    console.log("-- Error while trying to crawl page! Skipping...");
    return undefined;
  }
}

/** Resolves after `ms` milliseconds. */
function wait(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}