345 lines
11 KiB
TypeScript
345 lines
11 KiB
TypeScript
import puppeteer, { product } from "puppeteer";
|
|
import jsonfile from "jsonfile";
|
|
import cheerio from "cheerio";
|
|
import fs from "fs-extra";
|
|
import { getDomain } from "tldts";
|
|
import got from "got";
|
|
|
|
// Import website modules
|
|
import {
|
|
FitnessKoerier,
|
|
FitnessApparaat,
|
|
BolCom,
|
|
BeterSport,
|
|
FitnessGeest,
|
|
BodyAndFit,
|
|
PerfectBody,
|
|
Alternate,
|
|
Expert,
|
|
Silvergear,
|
|
AlsaNature,
|
|
Dobey,
|
|
EzyDog,
|
|
K9Shop,
|
|
OnlineHondenSpeciaalZaak,
|
|
MacroVet,
|
|
PetDuka,
|
|
ZooEnzo,
|
|
PetsOnline,
|
|
PetsPlace,
|
|
ThePetEmpire,
|
|
Conrad,
|
|
HuisdierExpress,
|
|
HondenBed,
|
|
Coolblue,
|
|
WifiMedia,
|
|
Ep
|
|
} from "./modules/websites";
|
|
|
|
// Module-level state shared by the crawler functions in this file.
let browser: puppeteer.Browser = null; // Set once puppeteer has launched.
let sitemapUrl = ""; // Taken from the first CLI argument inside start().
const allProducts: any[] = []; // Products whose stock check returned a result.
const allBlogUrls: any[] = []; // Every sitemap URL handed to crawlBlogPage().

// Entry point. Swap the two calls below to run the single-product debug helper instead.
// debugStart();
start();
|
|
|
|
/**
 * Manual debugging entry point: launches a visible (non-headless) browser and
 * prints the stock-check result for one hard-coded product URL.
 */
async function debugStart() {
  const debugProductUrl = "https://www.ep.nl/products/sony-kd-l32we610-hd-led-tv/27658/";

  browser = await puppeteer.launch({ headless: false });

  const result = await crawlProductStock(debugProductUrl);
  console.log(result);
}
|
|
|
|
/**
 * Main entry point of the crawler.
 *
 * Reads the sitemap URL from the first command-line argument, downloads the
 * sitemap, collects every distinct `<loc>` entry, crawls each of those pages
 * with a headless browser and finally writes the report.
 *
 * Any failure is logged to the console; the function itself never rejects.
 */
async function start() {
  try {
    console.log(`---- ContentEggStockCrawler ----`);

    // Get sitemap url from process params
    const params = process.argv.slice(2);
    sitemapUrl = params[0];

    if (!sitemapUrl) {
      // Fail early with a readable message instead of an opaque request error.
      throw new Error("No sitemap url given! Pass the sitemap url as the first argument.");
    }

    // Crawl sitemap
    const sitemap = await got(sitemapUrl);
    const $ = cheerio.load(sitemap.body, { xmlMode: true });

    // A Set keeps first-seen order and drops duplicate <loc> entries.
    const uniqueUrls = new Set<string>();
    $("loc").each((_index, element) => {
      uniqueUrls.add($(element).text());
    });
    const urls = Array.from(uniqueUrls);

    console.log(`Loaded ${urls.length} urls from sitemap! (${params[0]})`);

    console.log(`Starting crawler browser..`);
    browser = await puppeteer.launch({ headless: true });

    console.log(`Initialization done!`);
    console.log(`------------------------------------- \n`);

    try {
      for (const url of urls) {
        allBlogUrls.push(url);

        try {
          await crawlBlogPage(url);
        } catch (error) {
          // One failing page must not stop the whole run, but say what went wrong.
          console.error(`Error while crawling ${url}! Continuing with next url...`, error);
        }
      }
    } finally {
      // Always release the browser process, even if the loop aborts unexpectedly.
      await browser.close();
    }

    await generateReport();
  } catch (error) {
    console.error(`An Error Occured!`, error);
  }
}
|
|
|
|
/**
 * Crawls a single blog page, extracts all Content Egg products found on it
 * and checks the stock status of each one.
 *
 * Two layouts are recognised: the "row" layout (`.row-products`) and the
 * "large" layout (`div.egg-container.egg-item`). Every product whose stock
 * check yields a result is appended to the module-level `allProducts` list.
 *
 * Errors are logged and swallowed so that one broken page does not stop the run.
 *
 * @param url Address of the blog page to crawl.
 */
async function crawlBlogPage(url: string) {
  try {
    console.log(`Crawling blog page: ${url}`);

    // Open new page, goto url and grab the rendered html
    const page = await browser.newPage();
    let html = "";
    try {
      await page.goto(url, { waitUntil: "networkidle2" });
      html = await page.evaluate(() => document.body.innerHTML);
    } finally {
      // Close the tab even when navigation fails, otherwise tabs pile up.
      await page.close();
    }

    // Load html in cheerio object
    const $ = cheerio.load(html);

    const products: Array<any> = [];

    const rowContainers = $(".row-products");
    if (rowContainers.length >= 1) {
      console.log(`Detected ${rowContainers.length} content egg row type products!`);

      $(".row-products .cegg-list-logo-title a").each((_index, element) => {
        products.push({
          // html() yields null for a missing element; fall back to an empty name.
          name: ($(element).html() || "").trim(),
          blogUrl: url,
          url: $(element).attr("href"),
          domain: "",
          inStock: false,
        });
      });
    }

    const largeItems = $("div.egg-container.egg-item");
    if (largeItems.length >= 1) {
      console.log(`Detected ${largeItems.length} content egg large type products!`);

      largeItems.each((_index, element) => {
        const item = $(element);

        products.push({
          // An item without an <h2> gets an empty name instead of aborting the page.
          name: (item.find("h2").first().html() || "").trim(),
          blogUrl: url,
          url: item.find(".cegg-btn-row a").first().attr("href"),
          domain: "",
          inStock: false,
        });
      });
    }

    console.log("Checking product stocks...");

    for (const product of products) {
      try {
        if (!product.url) {
          // No link to follow, so there is nothing to check.
          console.log("Skipping product...");
          continue;
        }

        const status = await crawlProductStock(product.url);

        if (!status) {
          // crawlProductStock returns nothing when the product page could not be crawled.
          console.log("Skipping product...");
          continue;
        }

        product.domain = status[0];
        product.url = status[1];
        product.inStock = status[2];

        if (product.inStock) {
          console.log(`[IN STOCK] ${product.name} - ${product.domain}`);
        } else {
          console.log(`[OUT OF STOCK] ${product.name} - ${product.domain}`);
        }

        allProducts.push(product);
      } catch (error) {
        console.log("Skipping product...");
        continue;
      }
    }

    console.log("");
    console.log("");
  } catch (error) {
    console.error(error);
    console.log("Error while trying to crawl page! Skipping...");
  }
}
|
|
|
|
/**
 * Maps a registrable domain to the website module that can read the stock
 * status from that shop's product page html.
 *
 * @param domain Domain as returned by `getDomain`, or null when it could not be determined.
 * @returns The matching website module, or undefined when the shop is not supported.
 */
function resolveStockChecker(domain: string | null): { check(html: string): unknown } | undefined {
  switch (domain) {
    case "bol.com":
      return BolCom;
    case "coolblue.nl":
      return Coolblue;
    case "fitnesskoerier.nl":
      return FitnessKoerier;
    case "fitnessapparaat.nl":
      return FitnessApparaat;
    case "betersport.nl":
      return BeterSport;
    case "fitness-geest.nl":
      return FitnessGeest;
    // NOTE(review): gorillasports.nl shares the BodyAndFit checker and bcc.nl
    // shares the Expert checker - confirm these shops really use the same markup.
    case "bodyandfit.com":
    case "gorillasports.nl":
      return BodyAndFit;
    case "perfectbody.nl":
      return PerfectBody;
    case "alternate.nl":
      return Alternate;
    case "expert.nl":
    case "bcc.nl":
      return Expert;
    case "silvergear.eu":
      return Silvergear;
    case "conrad.com":
    case "conrad.nl":
      return Conrad;
    case "alsa-nature.nl":
      return AlsaNature;
    case "dobey.nl":
      return Dobey;
    case "ezydog.nl":
      return EzyDog;
    case "k9shop.nl":
      return K9Shop;
    case "onlinehondenspeciaalzaak.nl":
      return OnlineHondenSpeciaalZaak;
    case "macrovet.nl":
      return MacroVet;
    case "petduka.nl":
      return PetDuka;
    case "petsonline.nl":
      return PetsOnline;
    case "petsplace.nl":
      return PetsPlace;
    case "zoo-enzo.nl":
      return ZooEnzo;
    case "thepetempire.com":
      return ThePetEmpire;
    case "huisdierexpress.nl":
      return HuisdierExpress;
    case "hondenbed.nl":
      return HondenBed;
    case "wifimedia.eu":
      return WifiMedia;
    case "ep.nl":
      return Ep;
    default:
      return undefined;
  }
}

/**
 * Opens a product page and asks the matching website module whether the
 * product is in stock.
 *
 * @param url Product link as found on the blog page.
 * @returns `[domain, finalUrl, inStock]`, where `finalUrl` is the address the
 *   browser ended up on after navigation and `inStock` is `false` for
 *   unsupported shops. Returns `undefined` when the page could not be crawled
 *   (the error is logged).
 */
async function crawlProductStock(url: string) {
  try {
    // Open new page and goto url
    const page = await browser.newPage();
    let html = "";
    let finalUrl = url;

    try {
      await page.goto(url, { waitUntil: "networkidle2" });

      // Get html from page
      html = await page.evaluate(() => document.body.innerHTML);

      // Read the address while the page is still open; it may differ from
      // `url` if the navigation was redirected.
      finalUrl = page.url();
    } finally {
      // Close the tab even when navigation fails, otherwise tabs pile up.
      await page.close();
    }

    // Get domain from url
    const domain = getDomain(finalUrl);

    const checker = resolveStockChecker(domain);
    if (!checker) {
      console.error(`-- ${domain} is not an available website module! Can't check stock!`);
      return [domain, finalUrl, false];
    }

    return [domain, finalUrl, await checker.check(html)];
  } catch (error) {
    console.error(error);
    console.log("Error while trying to crawl page! Skipping...");
    return undefined;
  }
}
|
|
|
|
/**
 * Writes the crawl report to `<domain>_<year>-<month>-<day>_report.txt` in the
 * current working directory.
 *
 * The report starts with a summary (blog url count, product count,
 * out-of-stock count), followed by one section per blog url listing that
 * page's out-of-stock products. The text is appended, so running the crawler
 * twice on the same day adds a second report to the same file.
 */
async function generateReport() {
  console.log("Generating report...");

  const today = new Date();
  // getDate() is the day of the month; getDay() would give the weekday index (0-6).
  const datecode = `${today.getFullYear()}-${today.getMonth() + 1}-${today.getDate()}`;
  const domain = getDomain(sitemapUrl);
  const file = `${domain}_${datecode}_report.txt`;

  const totalProducts = allProducts.length;
  // Loose comparison kept on purpose: only values that equal `false` count as out of stock.
  const totalOutStock = allProducts.filter((product) => product.inStock == false).length;

  const lines: string[] = [
    `----------------------------------------\n`,
    `Content Egg Product Stock Crawler Report\n`,
    `Total Blog Urls: ${allBlogUrls.length}\n`,
    `Total Products: ${totalProducts}\n`,
    `Total Out of Stock: ${totalOutStock}\n`,
    `----------------------------------------\n`,
  ];

  let lastBlogUrl = "";
  for (const product of allProducts) {
    // Products are stored in crawl order, so a changed blog url starts a new section.
    if (product.blogUrl != lastBlogUrl) {
      lines.push(`\n\n`, `${product.blogUrl}\n`, `----------------------------------\n`);
    }

    if (product.inStock == false) {
      lines.push(`${product.domain} - ${product.name} - ${product.url}\n`);
    }

    lastBlogUrl = product.blogUrl;
  }

  // One append instead of one file operation per line.
  await fs.appendFile(file, lines.join(""));

  console.log("Report generated!");
}
|
|
|