added sitemap parser!
This commit is contained in:
50
src/core.ts
50
src/core.ts
@@ -3,6 +3,7 @@ import jsonfile from "jsonfile";
|
||||
import cheerio from "cheerio";
|
||||
import fs from "fs-extra";
|
||||
import { getDomain } from "tldts";
|
||||
import got from "got";
|
||||
|
||||
// Import website modules
|
||||
import {
|
||||
@@ -26,7 +27,10 @@ import {
|
||||
ZooEnzo,
|
||||
PetsOnline,
|
||||
PetsPlace,
|
||||
ThePetEmpire
|
||||
ThePetEmpire,
|
||||
Conrad,
|
||||
HuisdierExpress,
|
||||
HondenBed
|
||||
} from "./modules/websites";
|
||||
|
||||
start();
|
||||
@@ -35,20 +39,30 @@ start();
|
||||
let browser: puppeteer.Browser = null;
|
||||
let allProducts: Array<any> = [];
|
||||
let allBlogUrls: Array<any> = [];
|
||||
let urlsFileName = "";
|
||||
|
||||
async function start() {
|
||||
try {
|
||||
console.log(`---- ContentEggStockCrawler ----`);
|
||||
console.log(`---- ContentEggStockCrawler ----`);
|
||||
let params = process.argv.slice(2);
|
||||
|
||||
// Crawl Sitemap
|
||||
const sitemap = await got(params[0]);
|
||||
|
||||
const urls: Array<string> = [];
|
||||
const $ = cheerio.load(sitemap.body, { xmlMode: true })
|
||||
|
||||
$('loc').each(function() {
|
||||
const url = $(this).text()
|
||||
|
||||
if (!urls.includes(url)) {
|
||||
urls.push(url)
|
||||
}
|
||||
});
|
||||
|
||||
console.log(`Loaded ${urls.length} urls from sitemap! (${params[0]})`);
|
||||
|
||||
console.log(`Starting crawler browser..`);
|
||||
browser = await puppeteer.launch({ headless: true });
|
||||
|
||||
let params = process.argv.slice(2);
|
||||
urlsFileName = params[0].replace(/\.[^/.]+$/, "");
|
||||
|
||||
const urls = await jsonfile.readFile(`./${params[0]}`);
|
||||
console.log(`Loaded ${urls.length} urls from file! (${params[0]})`);
|
||||
|
||||
console.log(`Initialization done!`);
|
||||
console.log(`------------------------------------- \n`);
|
||||
@@ -133,9 +147,9 @@ async function crawlBlogPage(url: string) {
|
||||
products[index].inStock = status[2];
|
||||
|
||||
if (products[index].inStock) {
|
||||
console.log(`✅ ${products[index].name} - ${products[index].domain}`);
|
||||
console.log(`IN STOCK! ${products[index].name} - ${products[index].domain}`);
|
||||
} else {
|
||||
console.log(`❌ ${products[index].name} - ${products[index].domain}`);
|
||||
console.log(`OUT OF STOCK! ${products[index].name} - ${products[index].domain}`);
|
||||
}
|
||||
|
||||
allProducts.push(products[index]);
|
||||
@@ -208,6 +222,12 @@ async function crawlProductStock(url: string) {
|
||||
case "silvergear.eu":
|
||||
return [domain, page.url(), await Silvergear.check(html)];
|
||||
|
||||
case "conrad.com":
|
||||
return [domain, page.url(), await Conrad.check(html)];
|
||||
|
||||
case "conrad.nl":
|
||||
return [domain, page.url(), await Conrad.check(html)];
|
||||
|
||||
case "alsa-nature.nl":
|
||||
return [domain, page.url(), await AlsaNature.check(html)];
|
||||
|
||||
@@ -240,6 +260,12 @@ async function crawlProductStock(url: string) {
|
||||
|
||||
case "thepetempire.com":
|
||||
return [domain, page.url(), await ThePetEmpire.check(html)];
|
||||
|
||||
case "huisdierexpress.nl":
|
||||
return [domain, page.url(), await HuisdierExpress.check(html)];
|
||||
|
||||
case "hondenbed.nl":
|
||||
return [domain, page.url(), await HondenBed.check(html)];
|
||||
|
||||
default:
|
||||
console.error(`-- ${domain} is not an available website module! Can't check stock!`);
|
||||
@@ -254,7 +280,7 @@ async function crawlProductStock(url: string) {
|
||||
async function generateReport() {
|
||||
console.log("Generating report...");
|
||||
|
||||
let file = `${urlsFileName}-report.txt`;
|
||||
let file = `stock-report.txt`;
|
||||
let totalProducts = 0;
|
||||
let totalOutStock = 0;
|
||||
for (let product of allProducts) {
|
||||
|
@@ -166,7 +166,7 @@ export namespace BCC {
|
||||
export async function check(html: string) {
|
||||
try {
|
||||
const $ = cheerio.load(html);
|
||||
if ($(".productoffer__availability .productoffer__deliverymsg .icon").length >= 1) {
|
||||
if ($(".productoffer__availability .productoffer__deliverymsg .icon").first().length >= 1) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
@@ -183,7 +183,7 @@ export namespace Alternate {
|
||||
export async function check(html: string) {
|
||||
try {
|
||||
const $ = cheerio.load(html);
|
||||
if ($(".productMainContainerRow .stockStatus.available_stock")) {
|
||||
if ($(".productMainContainerRow .stockStatus.available_stock").length >= 1) {
|
||||
if ($(".productMainContainerRow .stockStatus.available_stock").first().html().trim().toLocaleLowerCase() >= "direct leverbaar") {
|
||||
return true;
|
||||
} else {
|
||||
@@ -217,6 +217,26 @@ export namespace Expert {
|
||||
}
|
||||
}
|
||||
|
||||
export namespace Conrad {
    /**
     * Reports whether a Conrad product page shows the product as available.
     *
     * Looks up the first `.product-summary .availability__status` element and
     * compares its trimmed, lower-cased inner HTML against "available".
     *
     * @param html Raw HTML of the product page.
     * @returns `true` only when the status element exists and reads
     *          "available"; `false` otherwise, including when parsing throws.
     */
    export async function check(html: string): Promise<boolean> {
        try {
            const $ = cheerio.load(html);
            const status = $(".product-summary .availability__status");

            // A cheerio selection object is always truthy, so the presence of
            // the element has to be tested through its length. Without this
            // guard a page lacking the element would reach html() === null
            // and only return false via the catch block, logging a bogus error.
            if (status.length < 1) {
                return false;
            }

            const label = status.first().html();
            return label != null && label.trim().toLowerCase() === "available";
        } catch (error) {
            console.log(error);
            console.error(`Error occured during stock check!`);
            return false;
        }
    }
}
|
||||
|
||||
export namespace Silvergear {
|
||||
export async function check(html: string) {
|
||||
try {
|
||||
@@ -323,11 +343,15 @@ export namespace MacroVet {
|
||||
export async function check(html: string) {
|
||||
try {
|
||||
const $ = cheerio.load(html);
|
||||
if ($('.product-essential meta[itemprop=availability]').attr("href") == "http://schema.org/InStock") {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ($(".row-products-blog .row-products-blog2 .active link").attr("href") == "http://schema.org/InStock") {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
console.error(`Error occured during stock check!`);
|
||||
@@ -419,4 +443,40 @@ export namespace ThePetEmpire {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export namespace HuisdierExpress {
    /**
     * Reports whether a HuisdierExpress product page marks the product as in stock.
     *
     * Reads the `content` attribute of the first element matching
     * `[itemprop=availability]` and compares it with "in_stock".
     *
     * @param html Raw HTML of the product page.
     * @returns `true` when the attribute equals "in_stock"; `false` when it is
     *          missing, different, or when parsing throws.
     */
    export async function check(html: string): Promise<boolean> {
        try {
            const $ = cheerio.load(html);
            const availability = $("[itemprop=availability]").first().attr("content");

            // attr() yields undefined when the element or attribute is absent,
            // which simply compares unequal here.
            return availability === "in_stock";
        } catch (error) {
            console.log(error);
            console.error(`Error occured during stock check!`);
            return false;
        }
    }
}
|
||||
|
||||
export namespace HondenBed {
    /**
     * Reports whether a HondenBed product page marks the product as in stock.
     *
     * Reads the `content` attribute of the first element matching
     * `[itemprop=availability]` and compares it with
     * "http://schema.org/InStock".
     *
     * @param html Raw HTML of the product page.
     * @returns `true` when the attribute equals "http://schema.org/InStock";
     *          `false` when it is missing, different, or when parsing throws.
     */
    export async function check(html: string): Promise<boolean> {
        try {
            const $ = cheerio.load(html);
            const availability = $("[itemprop=availability]").first().attr("content");

            // attr() yields undefined when the element or attribute is absent,
            // which simply compares unequal here.
            return availability === "http://schema.org/InStock";
        } catch (error) {
            console.log(error);
            console.error(`Error occured during stock check!`);
            return false;
        }
    }
}
|
||||
|
Reference in New Issue
Block a user