Added more webshop support, updated system to use csv writer.
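For context on the csv-writer switch, here is a minimal standalone sketch (not part of the commit) of the flow the diff below sets up: build a date code, create an object CSV writer with the same column ids, and write one record. The function name, file path, and record values are placeholders; note that the commit builds its date code with Date.getDay(), which returns the weekday (0-6), whereas getDate() returns the day of the month.

import { createObjectCsvWriter } from "csv-writer";

async function writeSampleReport(): Promise<void> {
    const today = new Date();
    // getDate() gives the calendar day; the commit itself uses getDay() (weekday 0-6).
    const datecode = `${today.getFullYear()}-${today.getMonth() + 1}-${today.getDate()}`;

    // Same column ids as in the diff; the path is a placeholder.
    const writer = createObjectCsvWriter({
        path: `example.nl_${datecode}_report.csv`,
        header: [
            { id: "domain", title: "WEBSHOP_DOMAIN" },
            { id: "product_name", title: "PRODUCT_NAME" },
            { id: "product_url", title: "PRODUCT_URL" },
            { id: "in_stock", title: "IN_STOCK" },
            { id: "blog_url", title: "BLOG_URL" }
        ]
    });

    // writeRecords maps object keys onto the header ids and appends the rows.
    await writer.writeRecords([
        {
            domain: "example.nl",
            product_name: "Sample product",
            product_url: "https://example.nl/sample-product",
            in_stock: true,
            blog_url: "https://example.com/blog/sample-post"
        }
    ]);
}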
src/core.ts (167 changes)
@@ -5,28 +5,30 @@ import fs from "fs";
import { getDomain } from "tldts";
import { Browser, launch } from "puppeteer";
import axios from "axios";
-import { Dobey, LDJsonParser } from "./modules/websites";
+import { Dobey, EP, LDJsonParser, Macrovet } from "./modules/websites";
+import { createObjectCsvWriter } from "csv-writer";
+import { CsvWriter } from "csv-writer/src/lib/csv-writer";
+import { ObjectMap } from "csv-writer/src/lib/lang/object";

interface IProduct {
-    name: string;
-    blogUrl: string;
-    url: string;
    domain: string;
-    inStock: boolean;
+    product_name: string;
+    product_url: string;
+    in_stock: boolean;
+    blog_url: string;
};

// Globals
let browser: Browser = null;
let sitemapUrl: string = "";
let allProducts: Array<IProduct> = [];
let allBlogUrls: Array<string> = [];
+let writer: CsvWriter<ObjectMap<any>> = null;

// debugStart();
start();

async function debugStart() {
    browser = await launch({ headless: false });
    console.log(await crawlProductStock("https://www.kabels.nl/nl_nl/ewent-ew3240-draadloze-multi-connect-muis-600-2400-dpi-zwart-34863350.html"));
+    console.log(await crawlProductStock("https://www.macrovet.nl/K-Othrine-7-5-SC-1-liter-Insectenbestrijdingsmiddel/202561"));
}

async function start() {
@@ -61,7 +63,7 @@ async function start() {


        for (const sitemapUrl of sitemapUrls) {
-            console.log(`Crawling crawled sitemap: ${sitemapUrl}`);
+            console.log(`Crawling found sitemap: ${sitemapUrl}`);
            const sitemapRequest = await axios.get(sitemapUrl);
            const $ = cheerio.load(sitemapRequest.data, { xmlMode: true });

@@ -76,15 +78,30 @@ async function start() {

        console.log(`Crawled ${urls.length} urls from sitemaps.`);

        console.log(`Starting crawler browser..`);
        browser = await launch({ headless: true });
        console.log(`Initialized puppeteer browser.`);

+        const today: Date = new Date();
+        const datecode: string = today.getFullYear().toString() + "-" + (today.getMonth() + 1).toString() + "-" + today.getDay().toString();
+        const domain: string = getDomain(sitemapUrl);
+        const file = `${domain}_${datecode}_report.csv`;
+        writer = createObjectCsvWriter({
+            path: file,
+            header: [
+                {id: 'domain', title: 'WEBSHOP_DOMAIN'},
+                {id: 'product_name', title: 'PRODUCT_NAME'},
+                {id: 'product_url', title: 'PRODUCT_URL'},
+                {id: 'in_stock', title: 'IN_STOCK'},
+                {id: 'blog_url', title: 'BLOG_URL'}
+            ]
+        });
+        console.log(`Initialized csv writer.`);
+
        console.log(`Initialization done.`);
        console.log(`------------------------------------- \n`);

-        for (let url of urls) {
+        for (const url of urls) {
            try {
                allBlogUrls.push(url);
                await crawlUrl(url);
            } catch (error) {
                continue;
@@ -92,9 +109,9 @@ async function start() {
        }

        await browser.close();
-        await generateReport();
+        console.log(`Finished crawling all urls. Saved report to ${file}`);
    } catch (error) {
-        console.error(`An Error Occured!`, error);
+        console.error(`A error occurred!`, error);
    }
}

@@ -115,21 +132,21 @@ async function crawlUrl(url: string) {
        // Load html in cheerio object
        const $ = cheerio.load(html);

-        let products: Array<any> = [];
+        const products: Array<IProduct> = [];

        if ($(".row-products").length >= 1) {
            console.log(`- Detected ${$(".row-products").length} content egg row type products.`);

            $(".row-products .cegg-list-logo-title a").each((index, element) => {
-                let productUrl = $(element).attr("href");
-                let name = $(element).html().trim();
+                const productUrl = $(element).attr("href");
+                const name = $(element).html().trim();

                products.push({
-                    name: name,
-                    blogUrl: url,
-                    url: productUrl,
                    domain: "",
-                    inStock: false,
+                    product_name: name,
+                    product_url: productUrl,
+                    blog_url: url,
+                    in_stock: false,
                });
            });
        }
@@ -142,47 +159,47 @@ async function crawlUrl(url: string) {
                let name = $(element).find("h2").first().html().trim();

                products.push({
-                    name: name,
-                    blogUrl: url,
-                    url: productUrl,
                    domain: "",
-                    inStock: false,
+                    product_name: name,
+                    product_url: productUrl,
+                    blog_url: url,
+                    in_stock: false,
                });
            });
        }

        console.log("- Checking product stocks...");

-        for (let index in products) {
+        for (const index in products) {
            try {
-                let status = await crawlProductStock(products[index].url);
+                const status = await crawlProductStock(products[index].product_url);

                products[index].domain = status[0];
-                products[index].url = status[1];
-                products[index].inStock = status[2];
+                products[index].product_url = status[1];
+                products[index].in_stock = status[2];

-                if (products[index].inStock) {
-                    console.log(` [IN STOCK] ${products[index].name} - ${products[index].domain}`);
+                if (products[index].in_stock) {
+                    console.log(` [IN STOCK] ${products[index].product_name} - ${products[index].domain}`);
                } else {
-                    console.log(` [OUT OF STOCK] ${products[index].name} - ${products[index].domain} - ${products[index].url}`);
+                    console.log(` [OUT OF STOCK] ${products[index].product_name} - ${products[index].domain}`);
                }

                allProducts.push(products[index]);
+                // Write to csv
+                await writer.writeRecords([products[index]]);
            } catch (error) {
-                console.log("- Skipping product...");
+                console.log(` [ERROR] ${products[index].product_name} - ${products[index].domain} - ${products[index].product_url}`, error);
                continue;
            }
        }

        console.log("");
        console.log("");

    } catch (error) {
        console.log("-- Error while trying to crawl page! Skipping...");
    }
}

-async function crawlProductStock(url: string) {
+async function crawlProductStock(url: string): Promise<[string, string, boolean]> {
    try {
        // Open new page and goto url
        const page = await browser.newPage();
@@ -218,7 +235,31 @@ async function crawlProductStock(url: string) {

            case "petsonline.nl":
                return [domain, page.url(), await LDJsonParser.check(html)];

+            case "coolblue.nl":
+                return [domain, page.url(), await LDJsonParser.check(html)];
+
+            case "bcc.nl":
+                return [domain, page.url(), await LDJsonParser.check(html)];
+
+            case "azerty.nl":
+                return [domain, page.url(), await LDJsonParser.check(html)];
+
+            case "cameranu.nl":
+                return [domain, page.url(), await LDJsonParser.check(html)];
+
+            case "ep.nl":
+                return [domain, page.url(), await EP.check(html)];
+
+            case "alternate.nl":
+                return [domain, page.url(), await LDJsonParser.check(html)];
+
+            case "macrovet.nl":
+                return [domain, page.url(), await Macrovet.check(html)];
+
+            case "ezydog.nl":
+                return [domain, page.url(), await LDJsonParser.check(html)];
+
            default:
                console.error(`-- ${domain} is not an supported website! Cannot check stock!`);
                return [domain, page.url(), false];
@@ -227,58 +268,4 @@ async function crawlProductStock(url: string) {
        console.error(error);
        console.log("-- Error while trying to crawl page! Skipping...");
    }
}
-
-async function generateReport() {
-    console.log("Generating report...");
-
-    let today: Date = new Date();
-    let datecode: string = today.getFullYear().toString() + "-" + (today.getMonth() + 1).toString() + "-" + today.getDay().toString();
-    let domain: string = getDomain(sitemapUrl);
-    let file = `${domain}_${datecode}_report.txt`;
-
-    let totalProducts = 0;
-    let totalOutStock = 0;
-    for (let product of allProducts) {
-        totalProducts++;
-
-        if (product.inStock == false) {
-            totalOutStock++;
-        }
-    }
-
-    appendLn(file, `Content Egg Product Stock Crawler Report\n`);
-    appendLn(file, `Total Blog Urls: ${allBlogUrls.length}\n`);
-    appendLn(file, `Total Products: ${totalProducts}\n`);
-    appendLn(file, `Total Out of Stock: ${totalOutStock}\n`);
-    appendLn(file, ``);
-
-    let lastBlogUrl = "";
-    for (let product of allProducts) {
-        if (product.blogUrl != lastBlogUrl) {
-            appendLn(file, `\n\n`);
-            appendLn(file, `${product.blogUrl}\n`);
-            appendLn(file, `----------------------------------\n`);
-        }
-
-        if (product.inStock == false) {
-            appendLn(file, `${product.domain} - ${product.name} - ${product.url}\n`);
-        }
-
-        lastBlogUrl = product.blogUrl;
-    }
-
-    console.log("Report generated.");
-}
-
-function appendLn(file: string, text: string): Promise<void> {
-    return new Promise((resolve, reject) => {
-        fs.appendFile(file, text + "\n", (err) => {
-            if (err) {
-                reject(err);
-            } else {
-                resolve();
-            }
-        });
-    });
-}
@@ -75,3 +75,37 @@ export namespace Dobey
    }
}

+export namespace EP {
+    export async function check(html: string) {
+        try {
+            const $ = cheerio.load(html);
+            if ($(".product__info--stock_row").first().find("p").hasClass("is-green")) {
+                return true;
+            } else {
+                return false;
+            }
+        } catch (error) {
+            console.log(error);
+            console.error(`Error occured during stock check!`);
+            return false;
+        }
+    }
+}
+
+
+export namespace Macrovet {
+    export async function check(html: string) {
+        try {
+            const $ = cheerio.load(html);
+            if ($(".product-detail-price-container .product-available-stock").first().text().includes("0 in voorraad")) {
+                return false;
+            } else {
+                return true;
+            }
+        } catch (error) {
+            console.log(error);
+            console.error(`Error occured during stock check!`);
+            return false;
+        }
+    }
+}
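The new EP and Macrovet checkers added at the end of the diff take raw HTML and return a boolean, so they can be exercised without launching puppeteer. A small usage sketch, assuming the demo file sits in src/ next to core.ts; the HTML fragments are hypothetical and only shaped like the selectors the checkers query:

import { EP, Macrovet } from "./modules/websites";

async function demoChecks(): Promise<void> {
    // Fragment with the stock-row paragraph carrying the "is-green" class, i.e. in stock.
    const epHtml = `<div class="product__info--stock_row"><p class="is-green">Op voorraad</p></div>`;
    // Fragment whose stock text reads "0 in voorraad", i.e. out of stock.
    const macrovetHtml = `<div class="product-detail-price-container"><span class="product-available-stock">0 in voorraad</span></div>`;

    console.log(await EP.check(epHtml));             // true
    console.log(await Macrovet.check(macrovetHtml)); // false
}

demoChecks();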