306 lines
11 KiB
TypeScript
306 lines
11 KiB
TypeScript
|
|
import jsonfile from "jsonfile";
|
|
import cheerio, { CheerioAPI } from "cheerio";
|
|
import fs from "fs";
|
|
import { getDomain } from "tldts";
|
|
import { Browser, launch } from "puppeteer";
|
|
import axios from "axios";
|
|
import { CoolBlue, Dectdirect, Dobey, EP, LDJsonParser, Macrovet, Mediamarkt } from "./modules/websites";
|
|
import { createObjectCsvWriter } from "csv-writer";
|
|
import { CsvWriter } from "csv-writer/src/lib/csv-writer";
|
|
import { ObjectMap } from "csv-writer/src/lib/lang/object";
|
|
|
|
/**
 * One product found on a crawled blog page, together with the result of
 * its stock check. Each instance becomes one row in the CSV report.
 */
interface IProduct {
    /** Domain of the webshop the product link resolved to; empty until the stock check has run. */
    domain: string;
    /** Product title as scraped from the blog page markup. */
    product_name: string;
    /** Link to the product; replaced by the final url after the stock check has followed redirects. */
    product_url: string;
    /** Result of the stock check; false until the check has run. */
    in_stock: boolean;
    /** Url of the blog page on which the product was found. */
    blog_url: string;
}
|
|
|
|
// Globals

/** Shared puppeteer browser; assigned by start() or debugStart() before any page is opened. */
let browser: Browser = null;

/** Url of the input sitemap; set in start() from the first command line argument. */
let sitemapUrl: string = "";

/** CSV report writer; created in start() once the report file name is known. */
let writer: CsvWriter<ObjectMap<any>> = null;

// Swap the two calls below to run the single-product debug entry point instead.
// debugStart();
start();
|
|
|
|
/**
 * Manual debugging entry point: launches a visible (non-headless) browser,
 * runs the stock check for a single hard-coded product url and prints the
 * resulting tuple.
 *
 * The browser is always closed afterwards so the process can exit, even when
 * the stock check throws.
 */
async function debugStart() {
    browser = await launch({ headless: false });

    try {
        console.log(await crawlProductStock("https://www.dectdirect.nl/nl/unifi-switch-lite-8-poe-115502613.html"));
        // console.log(await crawlProductStock("https://www.mediamarkt.nl/nl/product/_apple-iphone-14-128gb-midnight-1738479.html"));
        // console.log(await crawlProductStock("https://www.coolblue.nl/product/923036/hp-deskjet-2720e-all-in-one.html?clickref=1101lwW9ebAE&utm_source=performancehorizon&utm_medium=affiliate&utm_campaign=Comparison%2FReview&utm_content=1101lwW9ebAE&utm_term=1100leWsm&ref=293530&PHGref=1101lwW9ebAE&cmt=c_ph%2Capm_Comparison%2FReview_%2Cacid1101l93%2Cacr_1100leWsm%2Caclr_1101lwW9ebAE"));
        // console.log(await crawlProductStock("https://www.coolblue.nl/product/882996/canon-pixma-ts-3450-zwart.html?clickref=1011lwWDBFEL&utm_source=performancehorizon&utm_medium=affiliate&utm_campaign=Comparison%2FReview&utm_content=1011lwWDBFEL&utm_term=1100leWsm&ref=293530&PHGref=1011lwWDBFEL&cmt=c_ph%2Capm_Comparison%2FReview_%2Cacid1101l93%2Cacr_1100leWsm%2Caclr_1011lwWDBFEL"));
    } finally {
        // Without this the launched Chromium process stays alive and keeps node running.
        await browser.close();
    }
}
|
|
|
|
/** Collects the trimmed, non-empty text of every <loc> element of a parsed sitemap. */
function readSitemapLocs($: CheerioAPI): string[] {
    const locs: string[] = [];

    $("loc").each((_index, element) => {
        // Sitemaps are frequently pretty-printed, so the url may be padded with whitespace.
        const loc = $(element).text().trim();
        if (loc.length > 0) {
            locs.push(loc);
        }
    });

    return locs;
}

/**
 * Main entry point of the crawler.
 *
 * Reads the sitemap url from the first command line argument, collects all
 * page urls from it (following entries that end in ".xml" as nested
 * sitemaps), then crawls every page with crawlUrl() while writing the results
 * to `<domain>_<yyyy-m-d>_report.csv`.
 *
 * The process exits with code 1 when the sitemap argument is missing or the
 * main sitemap cannot be loaded. A nested sitemap that fails to load is
 * logged and skipped. Any other unexpected error is logged and marks the
 * process as failed via process.exitCode.
 */
async function start() {
    try {
        console.log(`---- ContentEggStockCrawler ----`);

        // Get sitemap url from process params
        sitemapUrl = process.argv[2] ?? "";
        if (!sitemapUrl) {
            console.log("[ERROR] No sitemap url given. Pass the sitemap url as the first argument.");
            process.exit(1);
        }

        let mainSitemap: CheerioAPI;
        try {
            // Crawl Sitemap
            console.log(`Crawling input sitemap: ${sitemapUrl}`);
            const response = await axios.get(sitemapUrl);
            mainSitemap = cheerio.load(response.data, { xmlMode: true });
        } catch (error) {
            console.log("[ERROR] Error occurred crawling main sitemap.", error instanceof Error ? error.message : error);
            // Non-zero exit code so callers (cron, CI) can detect the failure.
            process.exit(1);
        }

        // Sets keep insertion order and make de-duplication O(1) per url.
        const pageUrls = new Set<string>();
        const nestedSitemapUrls = new Set<string>();

        for (const loc of readSitemapLocs(mainSitemap)) {
            if (loc.endsWith(".xml")) {
                if (!nestedSitemapUrls.has(loc)) {
                    console.log("Found sitemap url: " + loc);
                    nestedSitemapUrls.add(loc);
                }
            } else {
                pageUrls.add(loc);
            }
        }

        for (const nestedUrl of Array.from(nestedSitemapUrls)) {
            console.log(`Crawling sitemap: ${nestedUrl}`);

            try {
                const response = await axios.get(nestedUrl);
                const nestedSitemap = cheerio.load(response.data, { xmlMode: true });

                for (const loc of readSitemapLocs(nestedSitemap)) {
                    pageUrls.add(loc);
                }
            } catch (error) {
                // One broken nested sitemap should not discard the urls found in the others.
                console.log(`[ERROR] Error occurred crawling sitemap: ${nestedUrl}. Skipping...`, error instanceof Error ? error.message : error);
            }
        }

        console.log(`Found ${pageUrls.size} urls from sitemaps.`);

        const today = new Date();
        const datecode = `${today.getFullYear()}-${today.getMonth() + 1}-${today.getDate()}`;
        // getDomain() returns null for input it cannot parse; avoid a "null_..." file name.
        const domain = getDomain(sitemapUrl) ?? "unknown";
        const file = `${domain}_${datecode}_report.csv`;

        browser = await launch({ headless: true });
        console.log(`Initialized puppeteer browser.`);

        try {
            writer = createObjectCsvWriter({
                path: file,
                header: [
                    {id: 'domain', title: 'WEBSHOP_DOMAIN'},
                    {id: 'product_name', title: 'PRODUCT_NAME'},
                    {id: 'product_url', title: 'PRODUCT_URL'},
                    {id: 'in_stock', title: 'IN_STOCK'},
                    {id: 'blog_url', title: 'BLOG_URL'}
                ],
                encoding: "utf8",
                recordDelimiter: "\r\n",
                fieldDelimiter: ",",
                alwaysQuote: true,
            });
            console.log(`Initialized csv writer.`);

            console.log(`Initialization done.`);
            console.log(`------------------------------------- \n`);

            for (const url of Array.from(pageUrls)) {
                try {
                    await crawlUrl(url);
                } catch (error) {
                    // crawlUrl() handles its own errors; this only guards against unexpected ones.
                    console.log(`-- Unexpected error while crawling ${url}! Skipping...`, error);
                }
            }
        } finally {
            // Always release the browser, otherwise the Chromium child process keeps node alive.
            await browser.close();
        }

        console.log(`Finished crawling all urls. Saved report to ${file}`);
    } catch (error) {
        console.error(`A error occurred!`, error);
        process.exitCode = 1;
    }
}
|
|
|
|
/**
 * Crawls a single blog page, extracts every Content Egg product found on it
 * (both the "row" list layout and the large item layout), checks the stock of
 * each product and appends one CSV row per successfully checked product.
 *
 * Errors never propagate to the caller: a failure for one product is logged
 * and the next product is processed; a failure to load or parse the page
 * itself is logged and the page is skipped.
 *
 * @param url Url of the blog page to crawl.
 */
async function crawlUrl(url: string) {
    try {
        console.log(`Crawling url: ${url}`);

        // Open new page, goto url and grab the rendered html.
        const page = await browser.newPage();
        let html: string;
        try {
            await page.goto(url, { waitUntil: 'networkidle2' });
            html = await page.evaluate(() => document.documentElement.outerHTML);
        } finally {
            // Close the page even when navigation fails, so tabs do not pile up in the browser.
            await page.close();
        }

        // Load html in cheerio object
        const $ = cheerio.load(html);

        const products: Array<IProduct> = [];

        const rowCount = $(".row-products").length;
        if (rowCount >= 1) {
            console.log(`- Parsed ${rowCount} content egg row type products.`);

            $(".row-products .cegg-list-logo-title a").each((_index, element) => {
                const link = $(element);

                products.push({
                    domain: "",
                    // html() returns null for nodes without markup; fall back to an empty name.
                    product_name: (link.html() ?? "").trim(),
                    product_url: link.attr("href") ?? "",
                    blog_url: url,
                    in_stock: false,
                });
            });
        }

        const largeItems = $("div.egg-container.egg-item");
        if (largeItems.length >= 1) {
            console.log(`- Parsed ${largeItems.length} content egg large type products.`);

            largeItems.each((_index, element) => {
                const item = $(element);

                products.push({
                    domain: "",
                    // An item without a <h2> must not abort parsing of the whole page.
                    product_name: (item.find("h2").first().html() ?? "").trim(),
                    product_url: item.find(".cegg-btn-row a").first().attr("href") ?? "",
                    blog_url: url,
                    in_stock: false,
                });
            });
        }

        console.log("- Checking product stocks...");

        for (const product of products) {
            try {
                if (!product.product_url) {
                    throw new Error("Product has no link to check.");
                }

                const status = await crawlProductStock(product.product_url);
                if (!status) {
                    // Guards against a stock check that logged its own error and returned nothing.
                    throw new Error("Stock check returned no result.");
                }

                const [domain, resolvedUrl, inStock] = status;
                product.domain = domain;
                product.product_url = resolvedUrl;
                product.in_stock = inStock;

                if (product.in_stock) {
                    console.log(`    [IN STOCK] ${product.product_name} - ${product.domain}`);
                } else {
                    console.log(`    [OUT OF STOCK] ${product.product_name} - ${product.domain}`);
                }

                // Write to csv
                await writer.writeRecords([{
                    domain: product.domain,
                    product_name: product.product_name,
                    product_url: product.product_url,
                    in_stock: product.in_stock,
                    blog_url: product.blog_url,
                }]);

                // Pause between product checks to avoid hammering the webshops.
                await wait(1000);
            } catch (error) {
                console.log(`    [ERROR] ${product.product_name} - ${product.domain} - ${product.product_url}`, error);
            }
        }

        console.log("");
        console.log("");
    } catch (error) {
        console.log("-- Error while trying to crawl page! Skipping...", error);
    }
}
|
|
|
|
/**
 * Selects the stock checker for a webshop domain, or null when the domain is
 * not supported. Shops without a dedicated checker share LDJsonParser.
 */
function selectStockChecker(domain: string | null) {
    switch (domain) {
        case "dobey.nl":
            return Dobey;

        case "coolblue.nl":
            return CoolBlue;

        case "ep.nl":
            return EP;

        case "macrovet.nl":
            return Macrovet;

        case "mediamarkt.nl":
            return Mediamarkt;

        case "dectdirect.nl":
            return Dectdirect;

        case "bol.com":
        case "petsplace.nl":
        case "brekz.nl":
        case "hondenbed.nl":
        case "petsonline.nl":
        case "bcc.nl":
        case "azerty.nl":
        case "cameranu.nl":
        case "alternate.nl":
        case "ezydog.nl":
        case "expert.nl":
        case "maxiaxi.com":
            return LDJsonParser;

        default:
            return null;
    }
}

/**
 * Opens a product link in the shared browser, follows any redirects and
 * determines whether the product is in stock using the checker that belongs
 * to the webshop the link ended up on.
 *
 * @param url Product link to check (may be an affiliate/redirect link).
 * @returns Tuple of [webshop domain, final url after redirects, in stock].
 *          For unsupported webshops the stock flag is always false. The
 *          domain is an empty string when it cannot be derived from the url.
 * @throws Rethrows any navigation or parsing error after logging it, so the
 *         caller never receives an undefined result.
 */
async function crawlProductStock(url: string): Promise<[string, string, boolean]> {
    try {
        // Open new page and goto url
        const page = await browser.newPage();
        let html: string;
        let finalUrl: string;

        try {
            // setUserAgent is asynchronous; it must complete before navigation starts.
            await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");
            await page.goto(url, { waitUntil: 'networkidle2' });

            // Get html from page
            html = await page.evaluate(() => document.documentElement.outerHTML);

            // Capture the final url while the page is still open.
            finalUrl = page.url();
        } finally {
            // Close page, also when navigation failed, so tabs do not pile up.
            await page.close();
        }

        // Get domain from url
        const domain = getDomain(finalUrl);

        const checker = selectStockChecker(domain);
        if (checker === null) {
            console.error(`-- ${domain} is not an supported website! Cannot check stock!`);
            return [domain ?? "", finalUrl, false];
        }

        return [domain ?? "", finalUrl, await checker.check(html)];
    } catch (error) {
        console.error(error);
        console.log("-- Error while trying to crawl page! Skipping...");
        throw error;
    }
}
|
|
|
|
/**
 * Returns a promise that resolves (with no value) after the given delay.
 *
 * @param ms Delay in milliseconds.
 */
function wait(ms: number): Promise<void> {
    return new Promise<void>((resolve) => {
        setTimeout(resolve, ms);
    });
}