2020-11-21 12:07:10 +01:00

250 lines
7.7 KiB
TypeScript

import puppeteer, { product } from "puppeteer";
import jsonfile from "jsonfile";
import cheerio from "cheerio";
import fs from "fs-extra";
import { getDomain } from "tldts";
// Import website modules
import {
FitnessKoerier,
FitnessApparaat,
BolCom,
BeterSport,
FitnessGeest,
BodyAndFit,
PerfectBody,
Alternate,
Expert,
Silvergear
} from "./modules/websites";
// Globals
// Module-level crawler state shared by all functions below. The state is
// declared before start() is invoked so that no code path can reach one of
// these bindings before it has been initialized.

/** Puppeteer browser instance used by the crawl functions; assigned in start(). */
let browser: puppeteer.Browser = null;
/** Every product for which a stock check produced a result, in crawl order. */
let allProducts: Array<any> = [];
/** Every blog url taken from the input file, in crawl order. */
let allBlogUrls: Array<any> = [];
/** Name of the input file without its extension; used to name the report file. */
let urlsFileName = "";

// Kick off the crawl. start() handles its own errors, so the returned promise
// does not need a rejection handler here.
start();
/**
 * Entry point of the crawler.
 *
 * Reads the JSON file named by the first command line argument, crawls every
 * blog url it contains, closes the browser and writes the stock report.
 * Errors are logged to the console; the returned promise does not reject.
 */
async function start() {
  try {
    console.log(`---- ContentEggStockCrawler ----`);

    // Validate the argument before launching the browser so that a missing
    // argument does not leave a browser process behind. Only local state is
    // touched before the first await.
    const params = process.argv.slice(2);
    const urlsFile = params[0];
    if (!urlsFile) {
      throw new Error("Missing argument: name of the JSON file containing the blog urls.");
    }

    console.log(`Starting crawler browser..`);
    browser = await puppeteer.launch({ headless: true });

    try {
      // Strip the file extension; the remainder names the report file.
      urlsFileName = urlsFile.replace(/\.[^/.]+$/, "");

      const urls = await jsonfile.readFile(`./${urlsFile}`);
      if (!Array.isArray(urls)) {
        throw new Error(`Expected ${urlsFile} to contain a JSON array of urls.`);
      }

      console.log(`Loaded ${urls.length} urls from file! (${urlsFile})`);
      console.log(`Initialization done!`);
      console.log(`------------------------------------- \n`);

      for (const url of urls) {
        try {
          allBlogUrls.push(url);
          await crawlBlogPage(url);
        } catch (error) {
          // One failing blog page must not stop the whole run, but the
          // failure should be visible in the output.
          console.error(`Failed to crawl ${url}, continuing with the next url.`, error);
        }
      }
    } finally {
      // Always release the browser, even when reading the url file or the
      // crawl itself fails; otherwise the browser process stays alive.
      await browser.close();
    }

    await generateReport();
  } catch (error) {
    console.error(`An Error Occured!`, error);
  }
}
/**
 * Crawls a single blog page, collects the Content Egg products found on it
 * and checks the stock of each product via crawlProductStock().
 *
 * Two product layouts are recognised: "row" products (links inside
 * `.row-products`) and "large" products (`div.egg-container.egg-item`).
 * Every product whose stock check returned a result is appended to the
 * module-level `allProducts` list. Errors are logged and never rethrown.
 *
 * @param url Address of the blog page to crawl.
 */
async function crawlBlogPage(url: string) {
  try {
    console.log(`Crawling blog page: ${url}`);

    // Render the page in the browser and grab the resulting html. The page is
    // closed in `finally` so it is not leaked when navigation fails.
    const page = await browser.newPage();
    let html = "";
    try {
      await page.goto(url, { waitUntil: 'networkidle2' });
      html = await page.evaluate(() => document.body.innerHTML);
    } finally {
      await page.close();
    }

    // Load html in cheerio object
    const $ = cheerio.load(html);
    const products: Array<any> = [];

    // Registers a product found on this blog page; entries without a link are
    // skipped because their stock cannot be checked.
    const addProduct = (name: string, productUrl: string | undefined) => {
      if (!productUrl) {
        console.log(`Skipping product without link: ${name}`);
        return;
      }
      products.push({
        name: name,
        blogUrl: url,
        url: productUrl,
        domain: "",
        inStock: false,
      });
    };

    const rowContainers = $(".row-products");
    if (rowContainers.length >= 1) {
      console.log(`Detected ${rowContainers.length} content egg row type products!`);
      $(".row-products .cegg-list-logo-title a").each((_index, element) => {
        const link = $(element);
        // html() yields null for an empty selection; fall back to "".
        addProduct((link.html() || "").trim(), link.attr("href"));
      });
    }

    const largeItems = $("div.egg-container.egg-item");
    if (largeItems.length >= 1) {
      // Report the number of large items (not the number of row containers).
      console.log(`Detected ${largeItems.length} content egg large type products!`);
      largeItems.each((_index, element) => {
        const item = $(element);
        const name = (item.find("h2").first().html() || "").trim();
        addProduct(name, item.find(".cegg-btn-row a").first().attr("href"));
      });
    }

    console.log("Checking product stocks...");
    for (const product of products) {
      try {
        const status = await crawlProductStock(product.url);
        // crawlProductStock resolves to undefined when the check failed.
        if (!status) {
          console.log("Skipping product...");
          continue;
        }
        product.domain = status[0];
        product.url = status[1];
        product.inStock = status[2];

        const stockLabel = product.inStock ? "in stock" : "OUT OF STOCK";
        console.log(`${product.name} - ${product.domain} - ${stockLabel}`);
        allProducts.push(product);
      } catch (error) {
        console.log("Skipping product...", error);
        continue;
      }
    }
    console.log("");
    console.log("");
  } catch (error) {
    console.log("Error while trying to crawl page! Skipping...", error);
  }
}
/**
 * Opens a product page and determines whether the product is in stock.
 *
 * The stock check is chosen by the registrable domain of the page's url
 * after navigation (which may differ from `url`, e.g. after a redirect).
 *
 * @param url Product link to open.
 * @returns `[domain, finalUrl, inStock]`, or `undefined` when loading or
 *          checking the page failed (the error is logged).
 */
async function crawlProductStock(url: string) {
  try {
    // Open new page and goto url
    const page = await browser.newPage();
    let html = "";
    let finalUrl = url;
    try {
      await page.goto(url, { waitUntil: 'networkidle2' });
      // Read everything needed from the page while it is still open.
      html = await page.evaluate(() => document.body.innerHTML);
      finalUrl = page.url();
    } finally {
      // Close the page on success and on failure so pages do not pile up.
      await page.close();
    }

    // Get domain from url
    const domain = getDomain(finalUrl);
    return [domain, finalUrl, await checkStockForDomain(domain, html)];
  } catch (error) {
    console.error(error);
    console.log("Error while trying to crawl page! Skipping...");
    return undefined;
  }
}

/**
 * Runs the website module that belongs to `domain` against the page html.
 * Unknown domains are reported on the console and treated as out of stock.
 */
async function checkStockForDomain(domain: string | null, html: string) {
  switch (domain) {
    case "bol.com":
      return BolCom.check(html);
    case "fitnesskoerier.nl":
      return FitnessKoerier.check(html);
    case "fitnessapparaat.nl":
      return FitnessApparaat.check(html);
    case "betersport.nl":
      return BeterSport.check(html);
    case "fitness-geest.nl":
      return FitnessGeest.check(html);
    // NOTE(review): gorillasports.nl is checked with the BodyAndFit module and
    // bcc.nl with the Expert module — confirm these shops share page markup.
    case "bodyandfit.com":
    case "gorillasports.nl":
      return BodyAndFit.check(html);
    case "perfectbody.nl":
      return PerfectBody.check(html);
    case "alternate.nl":
      return Alternate.check(html);
    case "expert.nl":
    case "bcc.nl":
      return Expert.check(html);
    case "silvergear.eu":
      return Silvergear.check(html);
    default:
      console.error(`-- ${domain} is not an available website module! Can't check stock!`);
      return false;
  }
}
/**
 * Writes the stock report for the finished crawl.
 *
 * The report consists of a summary (number of blog urls, products and
 * out-of-stock products) followed by one section per blog url that lists its
 * out-of-stock products. It is appended to `<urlsFileName>-report.txt`, so
 * content already present in that file is kept.
 */
async function generateReport() {
  console.log("Generating report...");
  const file = `${urlsFileName}-report.txt`;

  // Loose comparison on purpose: falsy non-boolean results such as 0 or ""
  // count as out of stock, while null/undefined do not.
  const isOutOfStock = (item: any) => item.inStock == false;
  const totalOutStock = allProducts.filter(isOutOfStock).length;

  // Build the whole report in memory and append it with a single write
  // instead of one file operation per line.
  const lines: string[] = [
    `----------------------------------------\n`,
    `Content Egg Product Stock Crawler Report\n`,
    `Total Blog Urls: ${allBlogUrls.length}\n`,
    `Total Products: ${allProducts.length}\n`,
    `Total Out of Stock: ${totalOutStock}\n`,
    `----------------------------------------\n`,
  ];

  // Products are stored in crawl order, so a change of blogUrl marks the
  // start of the next blog's section.
  let lastBlogUrl = "";
  for (const item of allProducts) {
    if (item.blogUrl !== lastBlogUrl) {
      lines.push(`\n\n`, `${item.blogUrl}\n`, `----------------------------------\n`);
    }
    if (isOutOfStock(item)) {
      lines.push(`${item.domain} - ${item.name} - ${item.url}\n`);
    }
    lastBlogUrl = item.blogUrl;
  }

  await fs.appendFile(file, lines.join(""));
  console.log("Report generated!");
}