Compare commits

..

No commits in common. "master" and "1.0.0" have entirely different histories.

4 changed files with 12 additions and 131 deletions

View File

@ -5,7 +5,7 @@ import fs from "fs";
import { getDomain } from "tldts"; import { getDomain } from "tldts";
import { Browser, launch } from "puppeteer"; import { Browser, launch } from "puppeteer";
import axios from "axios"; import axios from "axios";
import { CoolBlue, Dectdirect, Dobey, EP, LDJsonParser, Macrovet, Mediamarkt } from "./modules/websites"; import { Dobey, EP, LDJsonParser, Macrovet } from "./modules/websites";
import { createObjectCsvWriter } from "csv-writer"; import { createObjectCsvWriter } from "csv-writer";
import { CsvWriter } from "csv-writer/src/lib/csv-writer"; import { CsvWriter } from "csv-writer/src/lib/csv-writer";
import { ObjectMap } from "csv-writer/src/lib/lang/object"; import { ObjectMap } from "csv-writer/src/lib/lang/object";
@ -28,10 +28,7 @@ start();
async function debugStart() { async function debugStart() {
browser = await launch({ headless: false }); browser = await launch({ headless: false });
console.log(await crawlProductStock("https://www.dectdirect.nl/nl/unifi-switch-lite-8-poe-115502613.html")); console.log(await crawlProductStock("https://www.hondenbed.nl/hondenkussen-taupe-bruin.html?utm_medium=affiliate&utm_source=tradetracker"));
// console.log(await crawlProductStock("https://www.mediamarkt.nl/nl/product/_apple-iphone-14-128gb-midnight-1738479.html"));
// console.log(await crawlProductStock("https://www.coolblue.nl/product/923036/hp-deskjet-2720e-all-in-one.html?clickref=1101lwW9ebAE&utm_source=performancehorizon&utm_medium=affiliate&utm_campaign=Comparison%2FReview&utm_content=1101lwW9ebAE&utm_term=1100leWsm&ref=293530&PHGref=1101lwW9ebAE&cmt=c_ph%2Capm_Comparison%2FReview_%2Cacid1101l93%2Cacr_1100leWsm%2Caclr_1101lwW9ebAE"));
// console.log(await crawlProductStock("https://www.coolblue.nl/product/882996/canon-pixma-ts-3450-zwart.html?clickref=1011lwWDBFEL&utm_source=performancehorizon&utm_medium=affiliate&utm_campaign=Comparison%2FReview&utm_content=1011lwWDBFEL&utm_term=1100leWsm&ref=293530&PHGref=1011lwWDBFEL&cmt=c_ph%2Capm_Comparison%2FReview_%2Cacid1101l93%2Cacr_1100leWsm%2Caclr_1011lwWDBFEL"));
} }
async function start() { async function start() {
@ -72,7 +69,7 @@ async function start() {
for (const sitemapUrl of sitemapUrls) { for (const sitemapUrl of sitemapUrls) {
console.log(`Crawling sitemap: ${sitemapUrl}`); console.log(`Crawling found sitemap: ${sitemapUrl}`);
const sitemapRequest = await axios.get(sitemapUrl); const sitemapRequest = await axios.get(sitemapUrl);
const $ = cheerio.load(sitemapRequest.data, { xmlMode: true }); const $ = cheerio.load(sitemapRequest.data, { xmlMode: true });
@ -85,13 +82,13 @@ async function start() {
}); });
} }
console.log(`Found ${urls.length} urls from sitemaps.`); console.log(`Crawled ${urls.length} urls from sitemaps.`);
browser = await launch({ headless: true }); browser = await launch({ headless: true });
console.log(`Initialized puppeteer browser.`); console.log(`Initialized puppeteer browser.`);
const today: Date = new Date(); const today: Date = new Date();
const datecode: string = `${today.getFullYear()}-${today.getMonth() + 1}-${today.getDate()}`; const datecode: string = today.getFullYear().toString() + "-" + (today.getMonth() + 1).toString() + "-" + today.getDay().toString();
const domain: string = getDomain(sitemapUrl); const domain: string = getDomain(sitemapUrl);
const file = `${domain}_${datecode}_report.csv`; const file = `${domain}_${datecode}_report.csv`;
writer = createObjectCsvWriter({ writer = createObjectCsvWriter({
@ -137,7 +134,7 @@ async function crawlUrl(url: string) {
await page.goto(url, { waitUntil: 'networkidle2' }); await page.goto(url, { waitUntil: 'networkidle2' });
// Get html from page // Get html from page
const html = await page.evaluate(() => document.documentElement.outerHTML); const html = await page.evaluate(() => document.body.innerHTML);
// Close page // Close page
await page.close(); await page.close();
@ -205,8 +202,6 @@ async function crawlUrl(url: string) {
in_stock: products[index].in_stock, in_stock: products[index].in_stock,
blog_url: products[index].blog_url, blog_url: products[index].blog_url,
}]); }]);
await wait(1000);
} catch (error) { } catch (error) {
console.log(` [ERROR] ${products[index].product_name} - ${products[index].domain} - ${products[index].product_url}`, error); console.log(` [ERROR] ${products[index].product_name} - ${products[index].domain} - ${products[index].product_url}`, error);
continue; continue;
@ -224,11 +219,13 @@ async function crawlProductStock(url: string): Promise<[string, string, boolean]
try { try {
// Open new page and goto url // Open new page and goto url
const page = await browser.newPage(); const page = await browser.newPage();
page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");
await page.goto(url, { waitUntil: 'networkidle2' }); await page.goto(url, { waitUntil: 'networkidle2' });
// Get html from page // Get html from page
const html = await page.evaluate(() => document.documentElement.outerHTML); const html = await page.evaluate(() => document.body.innerHTML);
// Load html in cheerio object
const $ = cheerio.load(html);
// Get domain from url // Get domain from url
const domain = getDomain(page.url()); const domain = getDomain(page.url());
@ -256,7 +253,7 @@ async function crawlProductStock(url: string): Promise<[string, string, boolean]
return [domain, page.url(), await LDJsonParser.check(html)]; return [domain, page.url(), await LDJsonParser.check(html)];
case "coolblue.nl": case "coolblue.nl":
return [domain, page.url(), await CoolBlue.check(html)]; return [domain, page.url(), await LDJsonParser.check(html)];
case "bcc.nl": case "bcc.nl":
return [domain, page.url(), await LDJsonParser.check(html)]; return [domain, page.url(), await LDJsonParser.check(html)];
@ -279,18 +276,6 @@ async function crawlProductStock(url: string): Promise<[string, string, boolean]
case "ezydog.nl": case "ezydog.nl":
return [domain, page.url(), await LDJsonParser.check(html)]; return [domain, page.url(), await LDJsonParser.check(html)];
case "mediamarkt.nl":
return [domain, page.url(), await Mediamarkt.check(html)];
case "dectdirect.nl":
return [domain, page.url(), await Dectdirect.check(html)];
case "expert.nl":
return [domain, page.url(), await LDJsonParser.check(html)];
case "maxiaxi.com":
return [domain, page.url(), await LDJsonParser.check(html)];
default: default:
console.error(`-- ${domain} is not an supported website! Cannot check stock!`); console.error(`-- ${domain} is not an supported website! Cannot check stock!`);
return [domain, page.url(), false]; return [domain, page.url(), false];
@ -300,7 +285,3 @@ async function crawlProductStock(url: string): Promise<[string, string, boolean]
console.log("-- Error while trying to crawl page! Skipping..."); console.log("-- Error while trying to crawl page! Skipping...");
} }
} }
/**
 * Pause for a fixed amount of time.
 *
 * @param ms - Delay in milliseconds before the returned promise settles.
 * @returns A promise that resolves (without a value) once the timer fires.
 */
function wait(ms: number) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

View File

@ -20,14 +20,7 @@ export namespace LDJsonParser {
const snippets: Array<string> = []; const snippets: Array<string> = [];
const $ = cheerio.load(html); const $ = cheerio.load(html);
$('script[type="application/ld+json"]').each((index, element) => { $('script[type="application/ld+json"]').each((index, element) => {
let html = $(element).html(); snippets.push($(element).html());
// remove all spaces
html = html.replace(/\s/g, '');
// remove all newlines
html = html.replace(/\n/g, '');
snippets.push(html);
}); });
for (const snippet of snippets) { for (const snippet of snippets) {
@ -50,7 +43,6 @@ export namespace LDJsonParser {
} }
} }
} catch (error) { } catch (error) {
console.log(error);
continue; continue;
} }
} }
@ -64,84 +56,6 @@ export namespace LDJsonParser {
} }
} }
/**
 * Stock check that scans the JSON-LD script blocks of a product page.
 */
export namespace CoolBlue {
  /**
   * Report whether the page advertises a product as in stock.
   *
   * Every `<script type="application/ld+json">` body is stripped of all
   * whitespace and then searched as plain text (it is never JSON-parsed)
   * for a Product entry together with a schema.org InStock availability,
   * using either the http or the https form of the URL.
   *
   * @param html - Markup of the product page.
   * @returns `true` when a matching block is found; `false` otherwise, and
   *          also `false` when anything throws while scanning.
   */
  export async function check(html: string) {
    try {
      const $ = cheerio.load(html);
      const compacted: Array<string> = [];

      $('script[type="application/ld+json"]').each((_position, node) => {
        // Whitespace is dropped so the literal `"key":"value"` probes below
        // match regardless of how the JSON was formatted.
        const stripped = $(node).html().replace(/\s/g, '').replace(/\n/g, '');
        compacted.push(stripped);
      });

      for (const candidate of compacted) {
        try {
          const isProduct = candidate.includes(`"@type":"Product"`);
          const inStock =
            candidate.includes(`"availability":"http://schema.org/InStock"`) ||
            candidate.includes(`"availability":"https://schema.org/InStock"`);
          if (isProduct && inStock) {
            return true;
          }
        } catch (error) {
          console.log(error);
          continue;
        }
      }

      return false;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
/**
 * Stock check driven by a `data-test` availability marker in the page markup.
 */
export namespace Mediamarkt {
  /**
   * Report whether the page contains at least one element carrying
   * `data-test="mms-delivery-online-availability_AVAILABLE"`.
   *
   * @param html - Markup of the product page.
   * @returns `true` when the marker is present; `false` when it is absent
   *          or when loading/querying the markup throws.
   */
  export async function check(html: string) {
    try {
      const $ = cheerio.load(html);
      const availableMarkers = $('[data-test="mms-delivery-online-availability_AVAILABLE"]');
      return availableMarkers.length > 0;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
/**
 * Stock check that looks for an "op voorraad" label inside the product area.
 */
export namespace Dectdirect {
  /**
   * Report whether any `.js-product .rounded-full` element contains the
   * text "op voorraad".
   *
   * The match is recorded in a local variable because a `return` inside the
   * `.each` callback only leaves the callback, not this function; returning
   * from the callback directly made the check always answer `false`.
   *
   * @param html - Markup of the product page.
   * @returns `true` when a matching label is found; `false` when none is
   *          found or when loading/querying the markup throws.
   */
  export async function check(html: string) {
    try {
      const $ = cheerio.load(html);
      let inStock = false;

      $('.js-product .rounded-full').each((_index, element) => {
        inStock = $(element).text().includes("op voorraad");
        // Returning false from an `.each` callback stops the iteration, so
        // scanning ends at the first matching label.
        return !inStock;
      });

      return inStock;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
export namespace Dobey { export namespace Dobey {
export async function check(html: string) { export async function check(html: string) {

View File

@ -1,7 +0,0 @@
echo "Updating crawler source..."
git pull
echo "Updating crawler source... DONE"
echo ""
echo "Updating crawler packages..."
npm install
echo "Updating crawler packages... DONE"

View File

@ -1,7 +0,0 @@
echo "Updating crawler source..."
git pull
echo "Updating crawler source... DONE"
echo ""
echo "Updating crawler packages..."
npm install
echo "Updating crawler packages... DONE"