2023-06-21 16:48:31 +02:00

281 lines
9.4 KiB
TypeScript

import jsonfile from "jsonfile";
import cheerio from "cheerio";
import fs from "fs";
import { getDomain } from "tldts";
import { Browser, launch } from "puppeteer";
import axios from "axios";
import { Dobey, EP, LDJsonParser, Macrovet } from "./modules/websites";
import { createObjectCsvWriter } from "csv-writer";
import { CsvWriter } from "csv-writer/src/lib/csv-writer";
import { ObjectMap } from "csv-writer/src/lib/lang/object";
// One crawled product row, exactly mirroring the CSV report columns
// configured in start() (WEBSHOP_DOMAIN, PRODUCT_NAME, ...).
interface IProduct {
domain: string; // webshop domain the product link resolves to (filled in after the stock crawl)
product_name: string; // product title scraped from the blog page markup
product_url: string; // product/affiliate link; replaced with the post-redirect url after the stock crawl
in_stock: boolean; // result of the per-webshop stock check (false until checked)
blog_url: string; // the blog page the product was found on
};
// Globals — all initialized inside start() before first use.
let browser: Browser = null; // shared puppeteer browser, created in start(), closed when crawling ends
let sitemapUrl: string = ""; // root sitemap url, taken from process argv in start()
let writer: CsvWriter<ObjectMap<any>> = null; // csv report writer; crawlUrl() appends one record per product
// debugStart();
start();
/**
 * Manual debugging entry point (normally commented out above): launches a
 * visible browser and runs a single stock check against a known product url,
 * printing the resulting [domain, url, inStock] tuple.
 */
async function debugStart() {
  browser = await launch({ headless: false });
  const result = await crawlProductStock("https://www.hondenbed.nl/hondenkussen-taupe-bruin.html?utm_medium=affiliate&utm_source=tradetracker");
  console.log(result);
}
async function start() {
try {
console.log(`---- ContentEggStockCrawler ----`);
const urls: Array<string> = [];
const sitemapUrls: Array<string> = [];
// Get sitemap url from process params
let params = process.argv.slice(2);
sitemapUrl = params[0];
// Crawl Sitemap
console.log(`Crawling input sitemap: ${sitemapUrl}`);
const sitemapRequest = await axios.get(sitemapUrl);
const $ = cheerio.load(sitemapRequest.data, { xmlMode: true });
$('loc').each(function() {
const url = $(this).text();
if (url.endsWith(".xml")) {
if (!sitemapUrls.includes(url)) {
console.log("Found sitemap url: " + url);
sitemapUrls.push(url);
}
} else {
if (!urls.includes(url)) {
urls.push(url);
}
}
});
for (const sitemapUrl of sitemapUrls) {
console.log(`Crawling found sitemap: ${sitemapUrl}`);
const sitemapRequest = await axios.get(sitemapUrl);
const $ = cheerio.load(sitemapRequest.data, { xmlMode: true });
$('loc').each(function() {
const url = $(this).text();
if (!urls.includes(url)) {
urls.push(url);
}
});
}
console.log(`Crawled ${urls.length} urls from sitemaps.`);
browser = await launch({ headless: true });
console.log(`Initialized puppeteer browser.`);
const today: Date = new Date();
const datecode: string = today.getFullYear().toString() + "-" + (today.getMonth() + 1).toString() + "-" + today.getDay().toString();
const domain: string = getDomain(sitemapUrl);
const file = `${domain}_${datecode}_report.csv`;
writer = createObjectCsvWriter({
path: file,
header: [
{id: 'domain', title: 'WEBSHOP_DOMAIN'},
{id: 'product_name', title: 'PRODUCT_NAME'},
{id: 'product_url', title: 'PRODUCT_URL'},
{id: 'in_stock', title: 'IN_STOCK'},
{id: 'blog_url', title: 'BLOG_URL'}
],
encoding: "utf8",
recordDelimiter: "\r\n",
fieldDelimiter: ",",
alwaysQuote: true,
});
console.log(`Initialized csv writer.`);
console.log(`Initialization done.`);
console.log(`------------------------------------- \n`);
for (const url of urls) {
try {
await crawlUrl(url);
} catch (error) {
continue;
}
}
await browser.close();
console.log(`Finished crawling all urls. Saved report to ${file}`);
} catch (error) {
console.error(`A error occurred!`, error);
}
}
/**
 * Crawls one blog page: renders it in puppeteer, parses both content-egg
 * layouts ("row" lists and "large" items) for product links, checks each
 * product's stock via crawlProductStock(), and appends one CSV record per
 * product. Errors on a single product are logged and skipped; errors on the
 * page itself abort only this page.
 * @param url blog page url to crawl
 */
async function crawlUrl(url: string) {
  try {
    console.log(`Crawling url: ${url}`);
    // Open new page and goto url
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });
    // Get html from page
    const html = await page.evaluate(() => document.body.innerHTML);
    // Close page
    await page.close();
    // Load html in cheerio object
    const $ = cheerio.load(html);
    const products: Array<IProduct> = [];
    // Layout 1: content-egg "row" list — product name is the link text.
    if ($(".row-products").length >= 1) {
      console.log(`- Parsed ${$(".row-products").length} content egg row type products.`);
      $(".row-products .cegg-list-logo-title a").each((index, element) => {
        const productUrl = $(element).attr("href");
        // .html() returns null for an empty selection — guard before trim
        // so one malformed entry no longer aborts the whole page.
        const name = ($(element).html() || "").trim();
        products.push({
          domain: "",
          product_name: name,
          product_url: productUrl,
          blog_url: url,
          in_stock: false,
        });
      });
    }
    // Layout 2: content-egg "large" item — name in <h2>, link in the button row.
    if ($("div.egg-container.egg-item").length >= 1) {
      console.log(`- Parsed ${$("div.egg-container.egg-item").length} content egg large type products.`);
      $("div.egg-container.egg-item").each((index, element) => {
        const productUrl = $(element).find(".cegg-btn-row a").first().attr("href");
        const name = ($(element).find("h2").first().html() || "").trim();
        products.push({
          domain: "",
          product_name: name,
          product_url: productUrl,
          blog_url: url,
          in_stock: false,
        });
      });
    }
    console.log("- Checking product stocks...");
    // for..of instead of for..in: iterate the products themselves, not
    // their string indices.
    for (const product of products) {
      try {
        const status = await crawlProductStock(product.product_url);
        product.domain = status[0];
        product.product_url = status[1]; // final url after affiliate redirects
        product.in_stock = status[2];
        if (product.in_stock) {
          console.log(` [IN STOCK] ${product.product_name} - ${product.domain}`);
        } else {
          console.log(` [OUT OF STOCK] ${product.product_name} - ${product.domain}`);
        }
        // Write to csv
        await writer.writeRecords([{
          domain: product.domain,
          product_name: product.product_name,
          product_url: product.product_url,
          in_stock: product.in_stock,
          blog_url: product.blog_url,
        }]);
      } catch (error) {
        console.log(` [ERROR] ${product.product_name} - ${product.domain} - ${product.product_url}`, error);
        continue;
      }
    }
    console.log("");
    console.log("");
  } catch (error) {
    console.log("-- Error while trying to crawl page! Skipping...");
  }
}
async function crawlProductStock(url: string): Promise<[string, string, boolean]> {
try {
// Open new page and goto url
const page = await browser.newPage();
await page.goto(url, { waitUntil: 'networkidle2' });
// Get html from page
const html = await page.evaluate(() => document.body.innerHTML);
// Load html in cheerio object
const $ = cheerio.load(html);
// Get domain from url
const domain = getDomain(page.url());
// Close page
await page.close();
switch (domain) {
case "bol.com":
return [domain, page.url(), await LDJsonParser.check(html)];
case "petsplace.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "dobey.nl":
return [domain, page.url(), await Dobey.check(html)];
case "brekz.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "hondenbed.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "petsonline.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "coolblue.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "bcc.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "azerty.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "cameranu.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "ep.nl":
return [domain, page.url(), await EP.check(html)];
case "alternate.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "macrovet.nl":
return [domain, page.url(), await Macrovet.check(html)];
case "ezydog.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
default:
console.error(`-- ${domain} is not an supported website! Cannot check stock!`);
return [domain, page.url(), false];
}
} catch (error) {
console.error(error);
console.log("-- Error while trying to crawl page! Skipping...");
}
}