added sitemap parser!

This commit is contained in:
Nick Leeman
2021-03-20 13:34:04 +01:00
parent 22aa81f2ee
commit 48fee9b86d
6 changed files with 279 additions and 163 deletions

View File

@@ -3,6 +3,7 @@ import jsonfile from "jsonfile";
import cheerio from "cheerio";
import fs from "fs-extra";
import { getDomain } from "tldts";
import got from "got";
// Import website modules
import {
@@ -26,7 +27,10 @@ import {
ZooEnzo,
PetsOnline,
PetsPlace,
ThePetEmpire
ThePetEmpire,
Conrad,
HuisdierExpress,
HondenBed
} from "./modules/websites";
start();
@@ -35,20 +39,30 @@ start();
let browser: puppeteer.Browser = null;
let allProducts: Array<any> = [];
let allBlogUrls: Array<any> = [];
let urlsFileName = "";
async function start() {
try {
console.log(`---- ContentEggStockCrawler ----`);
console.log(`---- ContentEggStockCrawler ----`);
let params = process.argv.slice(2);
// Crawl Sitemap
const sitemap = await got(params[0]);
const urls: Array<string> = [];
const $ = cheerio.load(sitemap.body, { xmlMode: true })
$('loc').each(function() {
const url = $(this).text()
if (!urls.includes(url)) {
urls.push(url)
}
});
console.log(`Loaded ${urls.length} urls from sitemap! (${params[0]})`);
console.log(`Starting crawler browser..`);
browser = await puppeteer.launch({ headless: true });
let params = process.argv.slice(2);
urlsFileName = params[0].replace(/\.[^/.]+$/, "");
const urls = await jsonfile.readFile(`./${params[0]}`);
console.log(`Loaded ${urls.length} urls from file! (${params[0]})`);
console.log(`Initialization done!`);
console.log(`------------------------------------- \n`);
@@ -133,9 +147,9 @@ async function crawlBlogPage(url: string) {
products[index].inStock = status[2];
if (products[index].inStock) {
console.log(` ${products[index].name} - ${products[index].domain}`);
console.log(`IN STOCK! ${products[index].name} - ${products[index].domain}`);
} else {
console.log(` ${products[index].name} - ${products[index].domain}`);
console.log(`OUT OF STOCK! ${products[index].name} - ${products[index].domain}`);
}
allProducts.push(products[index]);
@@ -208,6 +222,12 @@ async function crawlProductStock(url: string) {
case "silvergear.eu":
return [domain, page.url(), await Silvergear.check(html)];
case "conrad.com":
return [domain, page.url(), await Conrad.check(html)];
case "conrad.nl":
return [domain, page.url(), await Conrad.check(html)];
case "alsa-nature.nl":
return [domain, page.url(), await AlsaNature.check(html)];
@@ -240,6 +260,12 @@ async function crawlProductStock(url: string) {
case "thepetempire.com":
return [domain, page.url(), await ThePetEmpire.check(html)];
case "huisdierexpress.nl":
return [domain, page.url(), await HuisdierExpress.check(html)];
case "hondenbed.nl":
return [domain, page.url(), await HondenBed.check(html)];
default:
console.error(`-- ${domain} is not an available website module! Can't check stock!`);
@@ -254,7 +280,7 @@ async function crawlProductStock(url: string) {
async function generateReport() {
console.log("Generating report...");
let file = `${urlsFileName}-report.txt`;
let file = `stock-report.txt`;
let totalProducts = 0;
let totalOutStock = 0;
for (let product of allProducts) {

View File

@@ -166,7 +166,7 @@ export namespace BCC {
export async function check(html: string) {
try {
const $ = cheerio.load(html);
if ($(".productoffer__availability .productoffer__deliverymsg .icon").length >= 1) {
if ($(".productoffer__availability .productoffer__deliverymsg .icon").first().length >= 1) {
return true;
} else {
return false;
@@ -183,7 +183,7 @@ export namespace Alternate {
export async function check(html: string) {
try {
const $ = cheerio.load(html);
if ($(".productMainContainerRow .stockStatus.available_stock")) {
if ($(".productMainContainerRow .stockStatus.available_stock").length >= 1) {
if ($(".productMainContainerRow .stockStatus.available_stock").first().html().trim().toLocaleLowerCase() >= "direct leverbaar") {
return true;
} else {
@@ -217,6 +217,26 @@ export namespace Expert {
}
}
export namespace Conrad {
  /**
   * Checks whether a Conrad product page reports the product as available.
   *
   * Looks up the first `.product-summary .availability__status` element and
   * compares its inner HTML (trimmed, lower-cased) with "available".
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when the status element exists and reads "available";
   *          `false` when it is missing, reads anything else, or parsing throws.
   */
  export async function check(html: string) {
    try {
      const $ = cheerio.load(html);
      const status = $(".product-summary .availability__status").first();

      // A cheerio selection object is always truthy, so existence has to be
      // tested through `.length`. Without this guard `.html()` yields null for
      // an empty selection and the `.trim()` call throws, which would log a
      // bogus error for every page that simply lacks the status element.
      if (status.length < 1) {
        return false;
      }

      const text = (status.html() || "").trim().toLowerCase();
      return text === "available";
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Silvergear {
export async function check(html: string) {
try {
@@ -323,11 +343,15 @@ export namespace MacroVet {
export async function check(html: string) {
try {
const $ = cheerio.load(html);
if ($('.product-essential meta[itemprop=availability]').attr("href") == "http://schema.org/InStock") {
return true;
}
if ($(".row-products-blog .row-products-blog2 .active link").attr("href") == "http://schema.org/InStock") {
return true;
} else {
return false;
}
return false;
} catch (error) {
console.log(error);
console.error(`Error occured during stock check!`);
@@ -419,4 +443,40 @@ export namespace ThePetEmpire {
return false;
}
}
}
}
export namespace HuisdierExpress {
  /**
   * Stock check for HuisdierExpress product pages.
   *
   * Reads the `content` attribute of the first `[itemprop=availability]`
   * element and treats the value "in_stock" as available.
   *
   * @param html Raw HTML of the product page.
   * @returns `true` only when that attribute equals "in_stock"; `false`
   *          otherwise, including when parsing throws (the error is logged).
   */
  export async function check(html: string) {
    let available = false;
    try {
      const page = cheerio.load(html);
      const availability = page("[itemprop=availability]").first().attr("content");
      available = availability === "in_stock";
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
    return available;
  }
}
export namespace HondenBed {
  /**
   * Stock check for HondenBed product pages.
   *
   * Takes the first `[itemprop=availability]` element and compares its
   * `content` attribute with "http://schema.org/InStock".
   *
   * @param html Raw HTML of the product page.
   * @returns `true` when the attribute matches exactly; `false` when it is
   *          absent, different, or when parsing throws (the error is logged).
   */
  export async function check(html: string) {
    try {
      const markup = cheerio.load(html);
      const marker = markup("[itemprop=availability]").first();
      return marker.attr("content") === "http://schema.org/InStock";
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}