refactor / cleanup

This commit is contained in:
2023-06-20 15:37:58 +02:00
parent 46bbc222a4
commit b8f77cbc58
8 changed files with 2358 additions and 2292 deletions

View File

@@ -1,56 +1,31 @@
import puppeteer, { product } from "puppeteer";
import jsonfile from "jsonfile";
import cheerio from "cheerio";
import fs from "fs-extra";
import fs from "fs";
import { getDomain } from "tldts";
import got from "got";
import { Browser, launch } from "puppeteer";
import axios from "axios";
import { BolCom } from "./modules/websites";
// Import website modules
import {
FitnessKoerier,
FitnessApparaat,
BolCom,
BeterSport,
FitnessGeest,
BodyAndFit,
PerfectBody,
Alternate,
Expert,
Silvergear,
AlsaNature,
Dobey,
EzyDog,
K9Shop,
OnlineHondenSpeciaalZaak,
MacroVet,
PetDuka,
ZooEnzo,
PetsOnline,
PetsPlace,
ThePetEmpire,
Conrad,
HuisdierExpress,
HondenBed,
Coolblue,
WifiMedia,
Ep,
Kinq,
Maxiaxi,
FotoDeVakman,
Kabels
} from "./modules/websites";
/**
 * One product link discovered on a crawled page, together with the outcome
 * of its stock check.
 */
interface IProduct {
  /** Product name as printed in the console output and the report. */
  name: string;
  /** URL of the page the product was found on; the report groups products by this value. */
  blogUrl: string;
  /** URL of the product itself. */
  url: string;
  /** Domain of the shop the product URL points to. */
  domain: string;
  /** Result of the stock check; `false` entries are listed in the report. */
  inStock: boolean;
}
// Globals
// NOTE(review): `browser`, `allProducts` and `allBlogUrls` each appear twice below —
// presumably the pre- and post-refactor form of the same declaration. Confirm that only
// one of each pair survives in the real file (a duplicate `let` does not compile).
// Shared puppeteer browser instance; assigned when the crawler launches the browser.
let browser: puppeteer.Browser = null;
let browser: Browser = null;
// Sitemap URL taken from the first command-line argument.
let sitemapUrl: string = "";
// Every product whose stock was checked, in crawl order.
let allProducts: Array<any> = [];
// Every URL that was handed to the page crawler.
let allBlogUrls: Array<any> = [];
let allProducts: Array<IProduct> = [];
let allBlogUrls: Array<string> = [];
// Swap the two calls below to run the single-URL debug entry point instead of the full crawl.
// debugStart();
start();
/**
 * Debug entry point: launches a visible (non-headless) browser and logs the
 * result of `crawlProductStock` for one hard-coded product URL.
 */
async function debugStart() {
// NOTE(review): the two launch statements below look like the old and new form of the
// same line; if both really run, the first browser instance is overwritten and never closed.
browser = await puppeteer.launch({ headless: false });
browser = await launch({ headless: false });
console.log(await crawlProductStock("https://www.kabels.nl/nl_nl/ewent-ew3240-draadloze-multi-connect-muis-600-2400-dpi-zwart-34863350.html"));
}
@@ -58,35 +33,59 @@ async function start() {
try {
console.log(`---- ContentEggStockCrawler ----`);
const urls: Array<string> = [];
const sitemapUrls: Array<string> = [];
// Get sitemap url from process params
let params = process.argv.slice(2);
sitemapUrl = params[0];
// Crawl Sitemap
const sitemap = await got(sitemapUrl);
const $ = cheerio.load(sitemap.body, { xmlMode: true })
console.log(`Crawling input sitemap: ${sitemapUrl}`);
const sitemapRequest = await axios.get(sitemapUrl);
const $ = cheerio.load(sitemapRequest.data, { xmlMode: true });
$('loc').each(function() {
const url = $(this).text()
const url = $(this).text();
if (!urls.includes(url)) {
urls.push(url)
if (url.endsWith(".xml")) {
if (!sitemapUrls.includes(url)) {
console.log("Found sitemap url: " + url);
sitemapUrls.push(url);
}
} else {
if (!urls.includes(url)) {
urls.push(url);
}
}
});
console.log(`Loaded ${urls.length} urls from sitemap! (${params[0]})`);
for (const sitemapUrl of sitemapUrls) {
console.log(`Crawling crawled sitemap: ${sitemapUrl}`);
const sitemapRequest = await axios.get(sitemapUrl);
const $ = cheerio.load(sitemapRequest.data, { xmlMode: true });
$('loc').each(function() {
const url = $(this).text();
if (!urls.includes(url)) {
urls.push(url);
}
});
}
console.log(`Crawled ${urls.length} urls from sitemaps.`);
console.log(`Starting crawler browser..`);
browser = await puppeteer.launch({ headless: true });
browser = await launch({ headless: true });
console.log(`Initialization done!`);
console.log(`Initialization done.`);
console.log(`------------------------------------- \n`);
for (let url of urls) {
try {
allBlogUrls.push(url);
await crawlBlogPage(url);
await crawlUrl(url);
} catch (error) {
continue;
}
@@ -99,9 +98,9 @@ async function start() {
}
}
async function crawlBlogPage(url: string) {
async function crawlUrl(url: string) {
try {
console.log(`Crawling blog page: ${url}`);
console.log(`Crawling url: ${url}`);
// Open new page and goto url
const page = await browser.newPage();
@@ -119,7 +118,7 @@ async function crawlBlogPage(url: string) {
let products: Array<any> = [];
if ($(".row-products").length >= 1) {
console.log(`Detected ${$(".row-products").length} content egg row type products!`);
console.log(`- Detected ${$(".row-products").length} content egg row type products.`);
$(".row-products .cegg-list-logo-title a").each((index, element) => {
let productUrl = $(element).attr("href");
@@ -136,7 +135,7 @@ async function crawlBlogPage(url: string) {
}
if ($("div.egg-container.egg-item").length >= 1) {
console.log(`Detected ${$("div.egg-container.egg-item").length} content egg large type products!`);
console.log(`- Detected ${$("div.egg-container.egg-item").length} content egg large type products.`);
$("div.egg-container.egg-item").each((index, element) => {
let productUrl = $(element).find(".cegg-btn-row a").first().attr("href");
@@ -152,7 +151,7 @@ async function crawlBlogPage(url: string) {
});
}
console.log("Checking product stocks...");
console.log("- Checking product stocks...");
for (let index in products) {
try {
@@ -163,14 +162,14 @@ async function crawlBlogPage(url: string) {
products[index].inStock = status[2];
if (products[index].inStock) {
console.log(`[IN STOCK] ${products[index].name} - ${products[index].domain}`);
console.log(` [IN STOCK] ${products[index].name} - ${products[index].domain}`);
} else {
console.log(`[OUT OF STOCK] ${products[index].name} - ${products[index].domain}`);
console.log(` [OUT OF STOCK] ${products[index].name} - ${products[index].domain}`);
}
allProducts.push(products[index]);
} catch (error) {
console.log("Skipping product...");
console.log("- Skipping product...");
continue;
}
}
@@ -179,7 +178,7 @@ async function crawlBlogPage(url: string) {
console.log("");
} catch (error) {
console.log("Error while trying to crawl page! Skipping...");
console.log("-- Error while trying to crawl page! Skipping...");
}
}
@@ -204,113 +203,14 @@ async function crawlProductStock(url: string) {
switch (domain) {
case "bol.com":
return [domain, page.url(), await BolCom.check(html)];
case "coolblue.nl":
return [domain, page.url(), await Coolblue.check(html)];
case "fitnesskoerier.nl":
return [domain, page.url(), await FitnessKoerier.check(html)];
case "fitnessapparaat.nl":
return [domain, page.url(), false];
case "betersport.nl":
return [domain, page.url(), await BeterSport.check(html)];
case "fitness-geest.nl":
return [domain, page.url(), await FitnessGeest.check(html)];
case "bodyandfit.com":
return [domain, page.url(), await BodyAndFit.check(html)];
case "gorillasports.nl":
return [domain, page.url(), await BodyAndFit.check(html)];
case "perfectbody.nl":
return [domain, page.url(), await PerfectBody.check(html)];
case "alternate.nl":
return [domain, page.url(), await Alternate.check(html)];
case "expert.nl":
return [domain, page.url(), await Expert.check(html)];
case "bcc.nl":
return [domain, page.url(), await Expert.check(html)];
case "silvergear.eu":
return [domain, page.url(), await Silvergear.check(html)];
case "conrad.com":
return [domain, page.url(), await Conrad.check(html)];
case "conrad.nl":
return [domain, page.url(), await Conrad.check(html)];
case "alsa-nature.nl":
return [domain, page.url(), await AlsaNature.check(html)];
case "dobey.nl":
return [domain, page.url(), await Dobey.check(html)];
case "ezydog.nl":
return [domain, page.url(), await EzyDog.check(html)];
case "k9shop.nl":
return [domain, page.url(), await K9Shop.check(html)];
case "onlinehondenspeciaalzaak.nl":
return [domain, page.url(), await OnlineHondenSpeciaalZaak.check(html)];
case "macrovet.nl":
return [domain, page.url(), await MacroVet.check(html)];
case "petduka.nl":
return [domain, page.url(), await PetDuka.check(html)];
case "petsonline.nl":
return [domain, page.url(), await PetsOnline.check(html)];
case "petsplace.nl":
return [domain, page.url(), await PetsPlace.check(html)];
case "zoo-enzo.nl":
return [domain, page.url(), await ZooEnzo.check(html)];
case "thepetempire.com":
return [domain, page.url(), await ThePetEmpire.check(html)];
case "huisdierexpress.nl":
return [domain, page.url(), await HuisdierExpress.check(html)];
case "hondenbed.nl":
return [domain, page.url(), await HondenBed.check(html)];
case "wifimedia.eu":
return [domain, page.url(), await WifiMedia.check(html)];
case "ep.nl":
return [domain, page.url(), await Ep.check(html)];
case "kinq.nl":
return [domain, page.url(), await Kinq.check(html)];
case "maxiaxi.com":
return [domain, page.url(), await Maxiaxi.check(html)];
case "fotodevakman.nl":
return [domain, page.url(), await FotoDeVakman.check(html)];
case "kabels.nl":
return [domain, page.url(), await Kabels.check(html)];
default:
console.error(`-- ${domain} is not an available website module! Can't check stock!`);
console.error(`-- ${domain} is not an supported website! Cannot check stock!`);
return [domain, page.url(), false];
}
} catch (error) {
console.error(error);
console.log("Error while trying to crawl page! Skipping...");
console.log("-- Error while trying to crawl page! Skipping...");
}
}
@@ -332,29 +232,38 @@ async function generateReport() {
}
}
await fs.appendFile(file, `----------------------------------------\n`);
await fs.appendFile(file, `Content Egg Product Stock Crawler Report\n`);
await fs.appendFile(file, `Total Blog Urls: ${allBlogUrls.length}\n`);
await fs.appendFile(file, `Total Products: ${totalProducts}\n`);
await fs.appendFile(file, `Total Out of Stock: ${totalOutStock}\n`);
await fs.appendFile(file, `----------------------------------------\n`);
await fs.appendFile(file, ``);
appendLn(file, `Content Egg Product Stock Crawler Report\n`);
appendLn(file, `Total Blog Urls: ${allBlogUrls.length}\n`);
appendLn(file, `Total Products: ${totalProducts}\n`);
appendLn(file, `Total Out of Stock: ${totalOutStock}\n`);
appendLn(file, ``);
let lastBlogUrl = "";
for (let product of allProducts) {
if (product.blogUrl != lastBlogUrl) {
await fs.appendFile(file, `\n\n`);
await fs.appendFile(file, `${product.blogUrl}\n`);
await fs.appendFile(file, `----------------------------------\n`);
appendLn(file, `\n\n`);
appendLn(file, `${product.blogUrl}\n`);
appendLn(file, `----------------------------------\n`);
}
if (product.inStock == false) {
await fs.appendFile(file, `${product.domain} - ${product.name} - ${product.url}\n`);
appendLn(file, `${product.domain} - ${product.name} - ${product.url}\n`);
}
lastBlogUrl = product.blogUrl;
}
console.log("Report generated!");
console.log("Report generated.");
}
/**
 * Appends `text` followed by a single newline to `file`.
 *
 * @param file Path of the file to append to.
 * @param text Content to write; a trailing "\n" is always added.
 * @returns A promise that resolves once the write has completed and rejects
 *          with the underlying file-system error otherwise.
 */
function appendLn(file: string, text: string): Promise<void> {
  const line = text + "\n";
  return fs.promises.appendFile(file, line);
}

View File

@@ -51,438 +51,6 @@ export namespace FitnessApparaat {
}
export namespace BolCom {
  /**
   * Decides stock status from the page markup: the product counts as in stock
   * unless an element marked `data-test="outofstock-buy-block"` is present.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` when out of stock or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const soldOutBlocks = page('[data-test="outofstock-buy-block"]');
      return soldOutBlocks.length < 1;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
// Stock check for BeterSport product pages.
// NOTE(review): the unconditional `return false` right after the document is loaded makes
// every branch below it unreachable, so this check currently always resolves to false.
// Presumably a deliberate kill-switch — confirm before removing either the early return
// or the dead code.
export namespace BeterSport {
export async function check(html: string) {
try {
const $ = cheerio.load(html);
return false;
// Unreachable from here on. The disabled logic treats the product as out of stock when
// the `.in-stock` label is the "Niet op voorraad…" message or mentions "week", and as
// in stock in every other case (including when the label is absent).
if ($('.product-shop .in-stock').first().html() !== null) {
if ($('.product-shop .in-stock').first().html().trim() == "Niet op voorraad, neem contact op voor actuele informatie") {
return false;
}
else if ($('.product-shop .in-stock').first().html().trim().includes("week")) {
return false;
}
else {
return true;
}
} else {
return true;
}
} catch (error) {
console.log(error);
console.error(`Error occured during stock check!`);
return false;
}
}
}
export namespace FitnessGeest {
  /**
   * In stock when the product info box contains at least one
   * `.in_stock_message` element.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const inStockMessages = page('.product-info-box .in_stock_message');
      return inStockMessages.length >= 1;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace BodyAndFit {
  /**
   * In stock unless the first primary cart button on the product page carries
   * a truthy `disabled` attribute.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const buyButton = page('.product-page .product-cart-controls .btn__primary').first();
      const disabledAttr = buyButton.attr("disabled");
      return !disabledAttr;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace GorillaSports {
  /**
   * In stock unless the buy area contains a `.button_sold_out` element.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const soldOutButtons = page('.product_buy .button_sold_out');
      return soldOutButtons.length === 0;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace PerfectBody {
  /**
   * In stock when the first `.product.type-product` element has the
   * `instock` class.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const productRoot = page('.product.type-product').first();
      return productRoot.hasClass("instock");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace BCC {
  /**
   * In stock when the availability block contains a delivery-message icon.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const deliveryIcon = page(".productoffer__availability .productoffer__deliverymsg .icon").first();
      return deliveryIcon.length > 0;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Alternate {
  /**
   * In stock when the first `.stockStatus.available_stock` element inside the
   * main product row contains the text "direct leverbaar" (case-insensitive).
   *
   * The status text used to be compared with `>=`, which is a lexicographic
   * string comparison: any label sorting after "direct leverbaar" (for
   * example "niet leverbaar") was reported as in stock. A substring match is
   * used instead.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const $ = cheerio.load(html);
      const status = $(".productMainContainerRow .stockStatus.available_stock").first();
      if (status.length === 0) {
        return false;
      }
      // `.html()` can be null for an empty selection; guard so `.trim()` never throws.
      const label = (status.html() || "").trim().toLowerCase();
      return label.includes("direct leverbaar");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Expert {
  /**
   * In stock when the product view contains an add-to-cart element.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const addToCart = page(".product__view .product__add-to-cart");
      return addToCart.length >= 1;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Conrad {
  /**
   * In stock when the first `.availability__status` element in the product
   * summary reads "available" (case-insensitive, surrounding whitespace
   * ignored).
   *
   * A cheerio selection is always a truthy object, so the previous
   * `if ($(...))` guard never protected the `.html().trim()` call. Pages
   * without the status element raised a TypeError that was logged as a
   * stock-check error. The selection length is checked explicitly instead.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const $ = cheerio.load(html);
      const status = $(".product-summary .availability__status").first();
      if (status.length === 0) {
        return false;
      }
      const label = (status.html() || "").trim().toLowerCase();
      return label === "available";
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Silvergear {
  /**
   * In stock when the first `.product.type-product` element has the
   * `instock` class.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const productRoot = page('.product.type-product').first();
      return productRoot.hasClass("instock");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace AlsaNature {
  /**
   * In stock when the first inventory-status icon has the `icon-in_stock`
   * class.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const statusIcon = page('.product .inventoryStatus .icon').first();
      return statusIcon.hasClass("icon-in_stock");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Dobey {
  /**
   * In stock when the stock indicator in the product view has the
   * `stock_green` class.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const indicator = page("#product_view #stock_indicator");
      return indicator.hasClass("stock_green");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace EzyDog {
  /**
   * In stock when the first `.availability` element in the product shop
   * section has the `in-stock` class.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const availability = page(".product-shop .availability").first();
      return availability.hasClass("in-stock");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace K9Shop {
  /**
   * In stock when the first icon inside `.ty-qty-in-stock` has the
   * `ty-icon-ok` class.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const stockIcon = page(".ty-qty-in-stock i").first();
      return stockIcon.hasClass("ty-icon-ok");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace OnlineHondenSpeciaalZaak {
  /**
   * In stock when the `#ShowOutOfStock` element inside the stock container
   * has the `hidden` class.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const outOfStockNotice = page("#Product #StockContainer #ShowOutOfStock");
      return outOfStockNotice.hasClass("hidden");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace MacroVet {
  /**
   * In stock when either of two availability elements carries an `href` of
   * `http://schema.org/InStock`. The second selector is only consulted when
   * the first one does not match.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const pointsToInStock = (selector: string) =>
        page(selector).attr("href") === "http://schema.org/InStock";

      return (
        pointsToInStock('.product-essential meta[itemprop=availability]') ||
        pointsToInStock(".row-products-blog .row-products-blog2 .active link")
      );
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace PetDuka {
  /**
   * In stock when the first icon inside `#product .hurry` has the
   * `icon-check-white` class.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const checkIcon = page("#product .hurry i").first();
      return checkIcon.hasClass("icon-check-white");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace PetsOnline {
  /**
   * In stock when the first icon inside the configure form's `.overlay-e`
   * element has the `icon-check-circle` class.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const checkIcon = page("#content #product_configure_form .overlay-e i").first();
      return checkIcon.hasClass("icon-check-circle");
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace PetsPlace {
  /**
   * In stock when the main product info contains the
   * `#product-addtocart-button` element.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const addToCart = page(".product-info-main #product-addtocart-button");
      return addToCart.length >= 1;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace ZooEnzo {
  /**
   * In stock when the `link` element under `.available-now` has an `href` of
   * `http://schema.org/InStock`.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const availabilityHref = page(".row.main_content .availability .available-now link").attr("href");
      return availabilityHref === "http://schema.org/InStock";
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace ThePetEmpire {
  /**
   * In stock when the product form contains an `.add-to-cart-button`
   * element.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const addToCart = page("#content .form-product .add-to-cart-button");
      return addToCart.length >= 1;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace HuisdierExpress {
  /**
   * In stock when the first `[itemprop=availability]` element has a
   * `content` attribute equal to `in_stock`.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const availability = page("[itemprop=availability]").first().attr("content");
      return availability === "in_stock";
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace HondenBed {
  /**
   * In stock when the first `[itemprop=availability]` element has a
   * `content` attribute equal to `http://schema.org/InStock`.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const availability = page("[itemprop=availability]").first().attr("content");
      return availability === "http://schema.org/InStock";
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Coolblue {
export async function check(html: string) {
try {
const $ = cheerio.load(html);
@@ -493,7 +61,7 @@ export namespace Coolblue {
if (json["@type"]) {
if (json["@type"] == "Product") {
if (json["offers"]["availability"] == "http://schema.org/InStock") {
if (json["offers"]["availability"].includes("InStock")) {
stock = true;
return;
}
@@ -528,43 +96,6 @@ export namespace WifiMedia {
}
}
export namespace Ep {
  /**
   * In stock when an `.is-green` element inside `.product__info` has
   * non-empty inner HTML.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      const greenMarkup = page(".product__info").find(".is-green").html();
      return Boolean(greenMarkup);
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Kinq {
  /**
   * In stock when the first `.stock` element inside
   * `.product-detail-infomation` has the exact inline style
   * `color: #2ace48;`.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const page = cheerio.load(html);
      // A selection object is always truthy, so no separate existence test is
      // needed: a missing element simply yields an undefined `style` attribute.
      const stockLabel = page(".product-detail-infomation").find(".stock").first();
      return stockLabel.attr("style") === "color: #2ace48;";
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Maxiaxi {
export async function check(html: string) {
@@ -592,50 +123,4 @@ export namespace Maxiaxi {
return false;
}
}
}
export namespace FotoDeVakman {
  /**
   * Determines stock status from the page's JSON-LD metadata: the product is
   * in stock when any `application/ld+json` script describes a `Product`
   * with an offer whose `availability` mentions `InStock`.
   *
   * Each script is parsed on its own, so one malformed or unrelated JSON-LD
   * block no longer discards a positive result found in another block.
   * `offers` may be a single object or an array, and both the `http://` and
   * `https://` schema.org URIs are accepted.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const $ = cheerio.load(html);
      let stock = false;

      $('script[type="application/ld+json"]').each((index, element) => {
        // Nothing left to learn once one in-stock offer has been seen.
        if (stock) {
          return;
        }

        let json;
        try {
          json = JSON.parse($(element).html() || "");
        } catch (parseError) {
          // Malformed or empty JSON-LD block: ignore it and keep scanning.
          return;
        }

        if (!json || json["@type"] != "Product" || !json["offers"]) {
          return;
        }

        const offers = Array.isArray(json["offers"]) ? json["offers"] : [json["offers"]];
        for (const offer of offers) {
          const availability = offer ? offer["availability"] : undefined;
          if (typeof availability === "string" && availability.includes("InStock")) {
            stock = true;
            return;
          }
        }
      });

      return stock;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Kabels {
  /**
   * In stock when the main product info contains at least one
   * `.stock .available` element.
   *
   * The previous check tested the selection object returned by `.first()`,
   * which is truthy even when nothing matched, so every page was reported as
   * in stock. The number of matched elements is tested instead.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when in stock, `false` otherwise or on any error.
   */
  export async function check(html: string) {
    try {
      const $ = cheerio.load(html);
      const availableMarkers = $(".product-info-main").find(".stock .available");
      return availableMarkers.length >= 1;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}