import jsonfile from "jsonfile";
import cheerio from "cheerio";
import fs from "fs";
import { getDomain } from "tldts";
import { Browser, launch } from "puppeteer";
import axios from "axios";
import { Dobey, EP, LDJsonParser, Macrovet } from "./modules/websites";
import { createObjectCsvWriter } from "csv-writer";
import { CsvWriter } from "csv-writer/src/lib/csv-writer";
import { ObjectMap } from "csv-writer/src/lib/lang/object";

/**
 * ContentEggStockCrawler.
 *
 * Reads a sitemap url from the first command line argument, collects every
 * page url listed in it (and in the `.xml` sitemaps it links to), renders each
 * page with puppeteer, extracts the Content Egg product links found on the
 * page, checks the stock status of every product on the webshop it points to
 * and appends one row per product to a CSV report.
 */

/** One row of the CSV report. */
interface IProduct {
  /** Domain of the webshop the product link resolved to. */
  domain: string;
  /** Product title as found on the blog page. */
  product_name: string;
  /** Product link; replaced by the final url after redirects once checked. */
  product_url: string;
  /** Result of the webshop specific stock check. */
  in_stock: boolean;
  /** Url of the crawled page the product was found on. */
  blog_url: string;
}

/** Signature shared by all webshop stock parsers: rendered html in, stock flag out. */
type StockChecker = (html: string) => boolean | Promise<boolean>;

// Globals
let browser: Browser = null;
let sitemapUrl: string = "";
// `ObjectMap<any>` mirrors the type of the writer passed around by csv-writer.
let writer: CsvWriter<ObjectMap<any>> = null;

// Wrapped in arrow functions so the parsers keep their own `this` binding.
const checkLdJson: StockChecker = (html) => LDJsonParser.check(html);
const checkDobey: StockChecker = (html) => Dobey.check(html);
const checkEp: StockChecker = (html) => EP.check(html);
const checkMacrovet: StockChecker = (html) => Macrovet.check(html);

/** Supported webshop domains and the parser used to read their stock status. */
const STOCK_CHECKERS = new Map<string, StockChecker>([
  ["bol.com", checkLdJson],
  ["petsplace.nl", checkLdJson],
  ["dobey.nl", checkDobey],
  ["brekz.nl", checkLdJson],
  ["hondenbed.nl", checkLdJson],
  ["petsonline.nl", checkLdJson],
  ["coolblue.nl", checkLdJson],
  ["bcc.nl", checkLdJson],
  ["azerty.nl", checkLdJson],
  ["cameranu.nl", checkLdJson],
  ["ep.nl", checkEp],
  ["alternate.nl", checkLdJson],
  ["macrovet.nl", checkMacrovet],
  ["ezydog.nl", checkLdJson],
]);

// debugStart();
start();

/**
 * Debug entry point: launches a visible browser and prints the stock check
 * result for a single hard-coded product url.
 */
async function debugStart() {
  browser = await launch({ headless: false });
  console.log(await crawlProductStock("https://www.hondenbed.nl/hondenkussen-taupe-bruin.html?utm_medium=affiliate&utm_source=tradetracker"));
}

/**
 * Main entry point. Collects the page urls from the sitemap passed as the
 * first command line argument, initializes the browser and the CSV writer and
 * crawls every url. Errors are logged instead of being thrown.
 */
async function start() {
  try {
    console.log(`---- ContentEggStockCrawler ----`);

    // Get sitemap url from process params
    const params = process.argv.slice(2);
    sitemapUrl = params[0];
    if (!sitemapUrl) {
      throw new Error("No sitemap url was passed as the first command line argument.");
    }

    // Sets give constant time duplicate checks; the arrays keep discovery order.
    const urls: Array<string> = [];
    const knownUrls = new Set<string>();
    const sitemapUrls: Array<string> = [];
    const knownSitemapUrls = new Set<string>();

    // Crawl Sitemap
    console.log(`Crawling input sitemap: ${sitemapUrl}`);
    for (const loc of await fetchSitemapLocs(sitemapUrl)) {
      if (loc.endsWith(".xml")) {
        if (!knownSitemapUrls.has(loc)) {
          console.log("Found sitemap url: " + loc);
          knownSitemapUrls.add(loc);
          sitemapUrls.push(loc);
        }
      } else if (!knownUrls.has(loc)) {
        knownUrls.add(loc);
        urls.push(loc);
      }
    }

    // Nested sitemaps are followed one level deep only: every <loc> they
    // contain is treated as a page url.
    for (const nestedSitemapUrl of sitemapUrls) {
      console.log(`Crawling found sitemap: ${nestedSitemapUrl}`);
      for (const loc of await fetchSitemapLocs(nestedSitemapUrl)) {
        if (!knownUrls.has(loc)) {
          knownUrls.add(loc);
          urls.push(loc);
        }
      }
    }
    console.log(`Crawled ${urls.length} urls from sitemaps.`);

    browser = await launch({ headless: true });
    const file = `${getDomain(sitemapUrl)}_${buildDateCode(new Date())}_report.csv`;
    try {
      console.log(`Initialized puppeteer browser.`);

      writer = createObjectCsvWriter({
        path: file,
        header: [
          { id: "domain", title: "WEBSHOP_DOMAIN" },
          { id: "product_name", title: "PRODUCT_NAME" },
          { id: "product_url", title: "PRODUCT_URL" },
          { id: "in_stock", title: "IN_STOCK" },
          { id: "blog_url", title: "BLOG_URL" },
        ],
        encoding: "utf8",
        recordDelimiter: "\r\n",
        fieldDelimiter: ",",
        alwaysQuote: true,
      });
      console.log(`Initialized csv writer.`);
      console.log(`Initialization done.`);
      console.log(`------------------------------------- \n`);

      for (const url of urls) {
        try {
          await crawlUrl(url);
        } catch (error) {
          // crawlUrl handles its own errors; this only guards the loop.
          console.error(`-- Unexpected error while crawling ${url}`, error);
        }
      }
    } finally {
      // Always release the browser, otherwise a failure keeps the process alive.
      await browser.close();
    }

    console.log(`Finished crawling all urls. Saved report to ${file}`);
  } catch (error) {
    console.error(`A error occurred!`, error);
  }
}

/**
 * Builds the `year-month-day` code used in the report file name. The parts
 * are not zero padded.
 */
function buildDateCode(date: Date): string {
  // getDate() is the day of the month; getDay() would be the day of the week.
  return [date.getFullYear(), date.getMonth() + 1, date.getDate()].join("-");
}

/** Downloads a sitemap and returns the text of every `<loc>` element in document order. */
async function fetchSitemapLocs(url: string): Promise<Array<string>> {
  const response = await axios.get(url);
  const $ = cheerio.load(response.data, { xmlMode: true });
  const locs: Array<string> = [];
  $("loc").each((_index, element) => {
    locs.push($(element).text());
  });
  return locs;
}

/**
 * Opens the url in a new browser page, waits for the network to settle and
 * returns the rendered body html together with the url the page ended up on.
 * The page is closed even when navigation or evaluation fails.
 */
async function loadPage(url: string): Promise<{ html: string; finalUrl: string }> {
  const page = await browser.newPage();
  try {
    await page.goto(url, { waitUntil: "networkidle2" });
    const html = await page.evaluate(() => document.body.innerHTML);
    // Read the url while the page is still open.
    return { html, finalUrl: page.url() };
  } finally {
    await page.close();
  }
}

/**
 * Extracts the Content Egg products (row type and large type) from the
 * rendered html of a page. Domain and stock status are left at their defaults
 * until the product has been checked.
 */
function parseProducts(html: string, blogUrl: string): Array<IProduct> {
  const $ = cheerio.load(html);
  const products: Array<IProduct> = [];
  const addProduct = (name: string | null, productUrl: string) => {
    products.push({
      domain: "",
      // A missing title must not abort the whole page.
      product_name: (name || "").trim(),
      product_url: productUrl,
      blog_url: blogUrl,
      in_stock: false,
    });
  };

  const rowCount = $(".row-products").length;
  if (rowCount >= 1) {
    console.log(`- Parsed ${rowCount} content egg row type products.`);
    $(".row-products .cegg-list-logo-title a").each((_index, element) => {
      addProduct($(element).html(), $(element).attr("href"));
    });
  }

  const largeItems = $("div.egg-container.egg-item");
  if (largeItems.length >= 1) {
    console.log(`- Parsed ${largeItems.length} content egg large type products.`);
    largeItems.each((_index, element) => {
      const item = $(element);
      addProduct(item.find("h2").first().html(), item.find(".cegg-btn-row a").first().attr("href"));
    });
  }

  return products;
}

/**
 * Crawls a single page: renders it, extracts its Content Egg products, checks
 * the stock of each product and writes one CSV row per checked product.
 * Failures are logged and never thrown, so one bad page or product does not
 * stop the crawl.
 */
async function crawlUrl(url: string) {
  try {
    console.log(`Crawling url: ${url}`);
    const { html } = await loadPage(url);
    const products = parseProducts(html, url);

    console.log("- Checking product stocks...");
    for (const product of products) {
      try {
        const [domain, resolvedUrl, inStock] = await crawlProductStock(product.product_url);
        product.domain = domain;
        product.product_url = resolvedUrl;
        product.in_stock = inStock;

        if (product.in_stock) {
          console.log(` [IN STOCK] ${product.product_name} - ${product.domain}`);
        } else {
          console.log(` [OUT OF STOCK] ${product.product_name} - ${product.domain}`);
        }

        // Write to csv
        await writer.writeRecords([product]);
      } catch (error) {
        // Products that could not be checked are logged and left out of the report.
        console.log(` [ERROR] ${product.product_name} - ${product.domain} - ${product.product_url}`, error);
      }
    }
    console.log("");
    console.log("");
  } catch (error) {
    console.log("-- Error while trying to crawl page! Skipping...", error);
  }
}

/**
 * Opens a product url and determines whether the product is in stock.
 *
 * @param url Product (affiliate) link; redirects are followed by the browser.
 * @returns `[domain, finalUrl, inStock]`. For webshops without a registered
 *          parser `inStock` is always `false`.
 * @throws Rethrows any navigation or parser error after logging it, so the
 *         caller sees the real cause.
 */
async function crawlProductStock(url: string): Promise<[string, string, boolean]> {
  try {
    const { html, finalUrl } = await loadPage(url);

    // Get domain from the url the page ended up on (after redirects)
    const domain = getDomain(finalUrl);
    const checker = STOCK_CHECKERS.get(domain);
    if (!checker) {
      console.error(`-- ${domain} is not an supported website! Cannot check stock!`);
      return [domain, finalUrl, false];
    }
    return [domain, finalUrl, await checker(html)];
  } catch (error) {
    console.error(error);
    console.log("-- Error while trying to crawl page! Skipping...");
    throw error;
  }
}