Compare commits
6 Commits
Author | SHA1 | Date | |
---|---|---|---|
9d0a0b732f | |||
fbf7c1c7d2 | |||
f34af7e673 | |||
84edf517d5 | |||
421155d3fa | |||
f5d85297b9 |
41
src/cli.ts
41
src/cli.ts
@ -5,7 +5,7 @@ import fs from "fs";
|
||||
import { getDomain } from "tldts";
|
||||
import { Browser, launch } from "puppeteer";
|
||||
import axios from "axios";
|
||||
import { Dobey, EP, LDJsonParser, Macrovet } from "./modules/websites";
|
||||
import { CoolBlue, Dectdirect, Dobey, EP, LDJsonParser, Macrovet, Mediamarkt } from "./modules/websites";
|
||||
import { createObjectCsvWriter } from "csv-writer";
|
||||
import { CsvWriter } from "csv-writer/src/lib/csv-writer";
|
||||
import { ObjectMap } from "csv-writer/src/lib/lang/object";
|
||||
@ -28,7 +28,10 @@ start();
|
||||
|
||||
/**
 * Debug entry point.
 *
 * Launches a puppeteer browser with `headless: false`, stores it in the
 * outer `browser` variable, and logs the result of `crawlProductStock`
 * for each hard-coded product URL that is not commented out below.
 */
async function debugStart() {
|
||||
browser = await launch({ headless: false });
|
||||
console.log(await crawlProductStock("https://www.hondenbed.nl/hondenkussen-taupe-bruin.html?utm_medium=affiliate&utm_source=tradetracker"));
|
||||
console.log(await crawlProductStock("https://www.dectdirect.nl/nl/unifi-switch-lite-8-poe-115502613.html"));
|
||||
// console.log(await crawlProductStock("https://www.mediamarkt.nl/nl/product/_apple-iphone-14-128gb-midnight-1738479.html"));
|
||||
// console.log(await crawlProductStock("https://www.coolblue.nl/product/923036/hp-deskjet-2720e-all-in-one.html?clickref=1101lwW9ebAE&utm_source=performancehorizon&utm_medium=affiliate&utm_campaign=Comparison%2FReview&utm_content=1101lwW9ebAE&utm_term=1100leWsm&ref=293530&PHGref=1101lwW9ebAE&cmt=c_ph%2Capm_Comparison%2FReview_%2Cacid1101l93%2Cacr_1100leWsm%2Caclr_1101lwW9ebAE"));
|
||||
// console.log(await crawlProductStock("https://www.coolblue.nl/product/882996/canon-pixma-ts-3450-zwart.html?clickref=1011lwWDBFEL&utm_source=performancehorizon&utm_medium=affiliate&utm_campaign=Comparison%2FReview&utm_content=1011lwWDBFEL&utm_term=1100leWsm&ref=293530&PHGref=1011lwWDBFEL&cmt=c_ph%2Capm_Comparison%2FReview_%2Cacid1101l93%2Cacr_1100leWsm%2Caclr_1011lwWDBFEL"));
|
||||
}
|
||||
|
||||
async function start() {
|
||||
@ -69,7 +72,7 @@ async function start() {
|
||||
|
||||
|
||||
for (const sitemapUrl of sitemapUrls) {
|
||||
console.log(`Crawling found sitemap: ${sitemapUrl}`);
|
||||
console.log(`Crawling sitemap: ${sitemapUrl}`);
|
||||
const sitemapRequest = await axios.get(sitemapUrl);
|
||||
const $ = cheerio.load(sitemapRequest.data, { xmlMode: true });
|
||||
|
||||
@ -82,13 +85,13 @@ async function start() {
|
||||
});
|
||||
}
|
||||
|
||||
console.log(`Crawled ${urls.length} urls from sitemaps.`);
|
||||
console.log(`Found ${urls.length} urls from sitemaps.`);
|
||||
|
||||
browser = await launch({ headless: true });
|
||||
console.log(`Initialized puppeteer browser.`);
|
||||
|
||||
const today: Date = new Date();
|
||||
const datecode: string = today.getFullYear().toString() + "-" + (today.getMonth() + 1).toString() + "-" + today.getDay().toString();
|
||||
const datecode: string = `${today.getFullYear()}-${today.getMonth() + 1}-${today.getDate()}`;
|
||||
const domain: string = getDomain(sitemapUrl);
|
||||
const file = `${domain}_${datecode}_report.csv`;
|
||||
writer = createObjectCsvWriter({
|
||||
@ -134,7 +137,7 @@ async function crawlUrl(url: string) {
|
||||
await page.goto(url, { waitUntil: 'networkidle2' });
|
||||
|
||||
// Get html from page
|
||||
const html = await page.evaluate(() => document.body.innerHTML);
|
||||
const html = await page.evaluate(() => document.documentElement.outerHTML);
|
||||
|
||||
// Close page
|
||||
await page.close();
|
||||
@ -202,6 +205,8 @@ async function crawlUrl(url: string) {
|
||||
in_stock: products[index].in_stock,
|
||||
blog_url: products[index].blog_url,
|
||||
}]);
|
||||
|
||||
await wait(1000);
|
||||
} catch (error) {
|
||||
console.log(` [ERROR] ${products[index].product_name} - ${products[index].domain} - ${products[index].product_url}`, error);
|
||||
continue;
|
||||
@ -219,13 +224,11 @@ async function crawlProductStock(url: string): Promise<[string, string, boolean]
|
||||
try {
|
||||
// Open new page and goto url
|
||||
const page = await browser.newPage();
|
||||
page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");
|
||||
await page.goto(url, { waitUntil: 'networkidle2' });
|
||||
|
||||
// Get html from page
|
||||
const html = await page.evaluate(() => document.body.innerHTML);
|
||||
|
||||
// Load html in cheerio object
|
||||
const $ = cheerio.load(html);
|
||||
const html = await page.evaluate(() => document.documentElement.outerHTML);
|
||||
|
||||
// Get domain from url
|
||||
const domain = getDomain(page.url());
|
||||
@ -253,7 +256,7 @@ async function crawlProductStock(url: string): Promise<[string, string, boolean]
|
||||
return [domain, page.url(), await LDJsonParser.check(html)];
|
||||
|
||||
case "coolblue.nl":
|
||||
return [domain, page.url(), await LDJsonParser.check(html)];
|
||||
return [domain, page.url(), await CoolBlue.check(html)];
|
||||
|
||||
case "bcc.nl":
|
||||
return [domain, page.url(), await LDJsonParser.check(html)];
|
||||
@ -276,6 +279,18 @@ async function crawlProductStock(url: string): Promise<[string, string, boolean]
|
||||
case "ezydog.nl":
|
||||
return [domain, page.url(), await LDJsonParser.check(html)];
|
||||
|
||||
case "mediamarkt.nl":
|
||||
return [domain, page.url(), await Mediamarkt.check(html)];
|
||||
|
||||
case "dectdirect.nl":
|
||||
return [domain, page.url(), await Dectdirect.check(html)];
|
||||
|
||||
case "expert.nl":
|
||||
return [domain, page.url(), await LDJsonParser.check(html)];
|
||||
|
||||
case "maxiaxi.com":
|
||||
return [domain, page.url(), await LDJsonParser.check(html)];
|
||||
|
||||
default:
|
||||
console.error(`-- ${domain} is not an supported website! Cannot check stock!`);
|
||||
return [domain, page.url(), false];
|
||||
@ -285,3 +300,7 @@ async function crawlProductStock(url: string): Promise<[string, string, boolean]
|
||||
console.log("-- Error while trying to crawl page! Skipping...");
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function wait(ms: number) {
    return new Promise(done => {
        setTimeout(done, ms);
    });
}
|
@ -20,7 +20,14 @@ export namespace LDJsonParser {
|
||||
const snippets: Array<string> = [];
|
||||
const $ = cheerio.load(html);
|
||||
$('script[type="application/ld+json"]').each((index, element) => {
|
||||
snippets.push($(element).html());
|
||||
let html = $(element).html();
|
||||
// remove all spaces
|
||||
html = html.replace(/\s/g, '');
|
||||
|
||||
// remove all newlines
|
||||
html = html.replace(/\n/g, '');
|
||||
|
||||
snippets.push(html);
|
||||
});
|
||||
|
||||
for (const snippet of snippets) {
|
||||
@ -43,6 +50,7 @@ export namespace LDJsonParser {
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -56,6 +64,84 @@ export namespace LDJsonParser {
|
||||
}
|
||||
}
|
||||
|
||||
export namespace CoolBlue {
    /**
     * Checks whether a page's JSON-LD data marks a Product as in stock.
     *
     * Every `<script type="application/ld+json">` body is collected with all
     * whitespace stripped, so the substring checks below work regardless of
     * how the JSON is formatted. The page counts as in stock when any
     * snippet contains both a `"@type":"Product"` marker and a schema.org
     * `InStock` availability (http or https form).
     *
     * @param html Full HTML of the product page.
     * @returns `true` if an in-stock Product snippet is found; `false`
     *          otherwise, including when an error is thrown while parsing.
     */
    export async function check(html: string) {
        const inStockMarkers = [
            `"availability":"http://schema.org/InStock"`,
            `"availability":"https://schema.org/InStock"`,
        ];

        try {
            const $ = cheerio.load(html);
            const blocks: Array<string> = [];

            $('script[type="application/ld+json"]').each((_position, node) => {
                // \s already matches newlines, so one pass removes all whitespace.
                blocks.push($(node).html().replace(/\s/g, ''));
            });

            return blocks.some(
                block =>
                    block.includes(`"@type":"Product"`) &&
                    inStockMarkers.some(marker => block.includes(marker)),
            );
        } catch (error) {
            console.log(error);
            console.error(`Error occured during stock check!`);
            return false;
        }
    }
}
|
||||
|
||||
export namespace Mediamarkt {
    /**
     * Checks whether the page contains an element whose `data-test`
     * attribute equals `mms-delivery-online-availability_AVAILABLE`.
     *
     * @param html Full HTML of the product page.
     * @returns `true` if at least one such element exists; `false`
     *          otherwise, including when an error is thrown while parsing.
     */
    export async function check(html: string) {
        try {
            const query = cheerio.load(html);
            const availableMarkers = query('[data-test="mms-delivery-online-availability_AVAILABLE"]');

            return availableMarkers.length > 0;
        } catch (error) {
            console.log(error);
            console.error(`Error occured during stock check!`);
            return false;
        }
    }
}
|
||||
|
||||
export namespace Dectdirect {
    /**
     * Checks whether the page shows an "op voorraad" (in stock) label.
     *
     * Looks at every element matching `.js-product .rounded-full` and
     * reports the product as in stock when the text of any of them
     * contains "op voorraad".
     *
     * @param html Full HTML of the product page.
     * @returns `true` if a matching element contains "op voorraad"; `false`
     *          otherwise, including when an error is thrown while parsing.
     */
    export async function check(html: string): Promise<boolean> {
        try {
            const $ = cheerio.load(html);

            // A `return true` inside the .each() callback only leaves the
            // callback, not this function, so the result is carried out
            // through a flag instead.
            let inStock = false;

            $('.js-product .rounded-full').each((_index, element) => {
                inStock = $(element).text().includes("op voorraad");
                // Returning false stops the .each() iteration at the first match.
                return !inStock;
            });

            return inStock;
        } catch (error) {
            console.log(error);
            console.error(`Error occured during stock check!`);
            return false;
        }
    }
}
|
||||
|
||||
|
||||
export namespace Dobey {
|
||||
export async function check(html: string) {
|
||||
|
7
update.linux.sh
Executable file
7
update.linux.sh
Executable file
@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env bash
# Updates the crawler: pulls the latest source, then installs npm packages.

# Abort on the first failing command so a failed step is never followed by
# its "DONE" message.
set -e

echo "Updating crawler source..."
git pull
echo "Updating crawler source... DONE"
echo ""
echo "Updating crawler packages..."
npm install
echo "Updating crawler packages... DONE"
|
7
update.win.bat
Normal file
7
update.win.bat
Normal file
@ -0,0 +1,7 @@
|
||||
@echo off
rem Updates the crawler: pulls the latest source, then installs npm packages.

rem Batch ECHO prints quotes literally, so the messages are left unquoted.
echo Updating crawler source...
git pull
echo Updating crawler source... DONE
rem "echo." prints an empty line; echo "" would print two quote characters.
echo.
echo Updating crawler packages...
rem On standard Windows installs npm is a batch script (npm.cmd). Without
rem CALL, control never returns to this script and the final message
rem would not be printed.
call npm install
echo Updating crawler packages... DONE
|
Loading…
x
Reference in New Issue
Block a user