Allisson Azevedo
'use strict';
const request = require('request');
const cheerio = require('cheerio');
// Scrapes the JSDay speakers page and prints one JSON record per speaker
// (title, description, name, about, image) to stdout.
request('http://jsday.com.br/speakers/', (error, response, body) => {
  // Without this guard a network failure would hand an undefined body to cheerio.
  if (error) {
    console.error(`Could not fetch the speakers page: ${error.message}`);
    return;
  }
  const $ = cheerio.load(body);
  const speakers = $('.people-modal')
    .toArray()
    .map((element) => {
      const modal = $(element);
      // The speaker's name is the first child node of `.name`; fall back to an
      // empty string when that node (or its text data) is absent.
      const nameNode = modal.find('.name').contents()[0];
      const name = nameNode && nameNode.data ? nameNode.data.trim() : '';
      // The picture is only available as an inline CSS background; strip the
      // `url(` ... `)` wrapper to get the bare address.
      const background = modal.find('.people-img').css('background-image') || '';
      return {
        title: modal.find('h4').text(),
        description: modal.find('.theme-description').text(),
        name: name,
        about: modal.find('.about').text(),
        image: background.replace('url(', '').replace(')', ''),
      };
    });
  console.log(JSON.stringify(speakers, null, 2));
});
'use strict';
const request = require('request');
const cheerio = require('cheerio');
// Scrapes the JSDay schedule page and prints the title and start time of every
// sub-event time slot as JSON on stdout.
request('http://jsday.com.br/schedule/', (error, response, body) => {
  // Without this guard a network failure would hand an undefined body to cheerio.
  if (error) {
    console.error(`Could not fetch the schedule page: ${error.message}`);
    return;
  }
  const $ = cheerio.load(body);
  const subEvents = $('.timeslot[itemtype="http://schema.org/subEvent"]')
    .toArray()
    .map((element) => {
      const slot = $(element);
      return {
        title: slot.find('.slot-title').text(),
        // Read from the `datetime` attribute of the `.start-time` element.
        time: slot.find('.start-time').attr('datetime'),
      };
    });
  console.log(JSON.stringify(subEvents, null, 2));
});
'use strict';
const request = require('request');
const cheerio = require('cheerio');
// Page where the crawl begins; also defines which URLs count as "same site".
const startUrl = 'https://allissonazevedo.com/';
// Matches only URLs that START with startUrl. The URL is escaped first so its
// dots are literal (otherwise "allissonazevedoXcom" would match), and the
// pattern is anchored so foreign URLs that merely contain startUrl are rejected.
const hostname = new RegExp(`^${startUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}`);
// URLs that have already been scheduled, to avoid fetching a page twice.
let urlSet = new Set();
// Collected { url, title } records, one per successfully fetched page.
let pages = [];
/**
 * Strips the query string and fragment from a URL.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} Everything before the first `?` or `#`, or the whole
 *   string when neither character is present.
 */
function normalizeUrl(url) {
  const cutAt = url.search(/[?#]/);
  if (cutAt === -1) {
    return url;
  }
  return url.slice(0, cutAt);
}
/**
 * Tells whether a URL is worth crawling.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} `false` when the URL ends in `.xml`, `/feed/` or `/amp/`
 *   (case-insensitive); `true` otherwise.
 */
function verifyUrl(url) {
  const skipPatterns = [/\.xml$/i, /\/feed\/$/i, /\/amp\/$/i];
  return !skipPatterns.some((pattern) => url.match(pattern));
}
/**
 * Fetches a page, records its URL and <title> in `pages`, and recursively
 * crawls every link on it that belongs to the site being crawled.
 *
 * A URL is skipped when it was already scheduled (tracked in `urlSet`) or when
 * `verifyUrl` rejects it. Responses with a status other than 200 are ignored.
 *
 * @param {string} url - URL to crawl; query string and fragment are dropped
 *   before it is used.
 * @returns {void}
 */
function getPage(url) {
  const pageUrl = normalizeUrl(url);
  if (urlSet.has(pageUrl) || !verifyUrl(pageUrl)) {
    return;
  }
  urlSet.add(pageUrl);
  request(pageUrl, (error, response, body) => {
    if (error) {
      // Report on stderr so the JSON written to stdout stays clean.
      console.error(`Failed to fetch ${pageUrl}: ${error.message}`);
      return;
    }
    if (response.statusCode !== 200) {
      return;
    }
    const $ = cheerio.load(body);
    pages.push({ url: pageUrl, title: $('title').text() });
    $('[href]').each((i, element) => {
      let absoluteUrl;
      try {
        // Resolve relative links ("/about", "../post") against the current
        // page; otherwise they can never pass the same-site check below.
        absoluteUrl = new URL($(element).attr('href'), pageUrl).href;
      } catch (err) {
        // Malformed href: nothing to follow.
        return;
      }
      const newUrl = normalizeUrl(absoluteUrl);
      if (hostname.test(newUrl)) {
        getPage(newUrl);
      }
    });
  });
}
// Start crawling from the configured entry page.
getPage(startUrl);

// Print everything that was collected as pretty-printed JSON when the process
// emits its 'exit' event.
process.on('exit', function reportPages() {
  const report = JSON.stringify(pages, null, 2);
  console.log(report);
});
'use strict';
const Crawler = require('simplecrawler');
// Crawler for allissonazevedo.com that collects the URL and <title> of every
// page it fetches and prints them as JSON on exit.
const myCrawler = new Crawler('allissonazevedo.com');
const cheerio = require('cheerio');
let pages = [];

// Crawler tuning; see the simplecrawler docs for the exact meaning of each option.
Object.assign(myCrawler, {
  interval: 100,
  maxConcurrency: 16,
  stripQuerystring: true,
});

// Fetch condition: refuse paths ending in .xml, /feed/ or /amp/ (case-insensitive).
// `verifyUrl` holds whatever addFetchCondition returns.
const verifyUrl = myCrawler.addFetchCondition((parsedURL, queueItem) => {
  const excluded = [/\.xml$/i, /\/feed\/$/i, /\/amp\/$/i];
  return !excluded.some((pattern) => parsedURL.path.match(pattern));
});

// Record the page's URL together with the text of its <title> element.
myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  const title = cheerio.load(responseBuffer)('title').text();
  pages.push({ url: queueItem.url, title: title });
});

myCrawler.start();

// Dump the collected pages as pretty-printed JSON when the process exits.
process.on('exit', function reportPages() {
  const report = JSON.stringify(pages, null, 2);
  console.log(report);
});
'use strict';
const Crawler = require('simplecrawler');
// Minimal crawler for www.tudogostoso.com.br: prints the URL of every resource
// it finishes fetching.
const myCrawler = new Crawler('www.tudogostoso.com.br');

// Crawler tuning; see the simplecrawler docs for the exact meaning of each option.
Object.assign(myCrawler, {
  interval: 100,
  stripQuerystring: true,
  maxConcurrency: 16,
});

myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  const fetchedUrl = queueItem.url;
  console.log(fetchedUrl);
});

myCrawler.start();
'use strict';
const Crawler = require('simplecrawler');
// Crawler for www.tudogostoso.com.br that skips static assets and printable
// recipe variants, and prints the URL of every resource it fetches.
const myCrawler = new Crawler('www.tudogostoso.com.br');

// Crawler tuning; see the simplecrawler docs for the exact meaning of each option.
Object.assign(myCrawler, {
  interval: 100,
  stripQuerystring: true,
  maxConcurrency: 16,
});

// Fetch condition: returns false (do not fetch) when the path matches any of
// the patterns below, true otherwise. `verifyUrl` holds whatever
// addFetchCondition returns.
const verifyUrl = myCrawler.addFetchCondition((parsedURL, queueItem) => {
  const excluded = [
    /\.ico$/i,
    /\.css$/i,
    /\.png$/i,
    /\.gif$/i,
    /\.jpg$/i,
    /\.js$/i,
    /print_recipe\.php/i,
    /\/print$/i,
  ];
  return !excluded.some((pattern) => parsedURL.path.match(pattern));
});

myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  const fetchedUrl = queueItem.url;
  console.log(fetchedUrl);
});

myCrawler.start();
http://www.tudogostoso.com.br/receita/76147-anchova-assada.html
'use strict';
// Recipe pages look like /receita/<numeric id>-<slug>.html (case-insensitive).
const re = /\/receita\/([0-9]+)-([\w-]+)\.html$/i;

// Sample URLs: the home page, a category listing and an actual recipe.
const urls = [
  'http://www.tudogostoso.com.br/',
  'http://www.tudogostoso.com.br/categorias/bolos-e-tortas-doces.php',
  'http://www.tudogostoso.com.br/receita/179236-petit-gateau-de-nutella-perfeito.html'
];

// Report, for each sample, whether it matches the recipe pattern.
urls.forEach((candidate) => {
  const verdict = re.test(candidate) ? ' é uma receita.' : ' não é uma receita';
  console.log(candidate + verdict);
});
'use strict';
const cheerio = require('cheerio');
const Crawler = require('simplecrawler');
// Crawler for www.tudogostoso.com.br used to extract recipes.
const myCrawler = new Crawler('www.tudogostoso.com.br');
// Recipe pages look like /receita/<numeric id>-<slug>.html (case-insensitive).
const re = /\/receita\/([0-9]+)-([\w-]+)\.html$/i;

// Crawler tuning; see the simplecrawler docs for the exact meaning of each option.
Object.assign(myCrawler, {
  interval: 100,
  stripQuerystring: true,
  maxConcurrency: 16,
});

// Fetch condition: returns false (do not fetch) when the path matches any of
// the patterns below, true otherwise. `verifyUrl` holds whatever
// addFetchCondition returns.
const verifyUrl = myCrawler.addFetchCondition((parsedURL, queueItem) => {
  const excluded = [
    /\.ico$/i,
    /\.css$/i,
    /\.png$/i,
    /\.gif$/i,
    /\.jpg$/i,
    /\.js$/i,
    /print_recipe\.php/i,
    /\/print$/i,
  ];
  return !excluded.some((pattern) => parsedURL.path.match(pattern));
});
/**
 * Builds a recipe record from a fetched recipe page.
 *
 * @param {Object} queueItem - Crawler queue item; only its `url` is read.
 * @param {Buffer|string} responseBuffer - Page HTML, handed to cheerio.load.
 * @returns {Object} Record with `url`, `name`, `image`, `ingredients`,
 *   `instructions`, `yield` and `preptime`. Attribute-based fields are
 *   undefined when the matching element or attribute is absent.
 */
function parseResponse(queueItem, responseBuffer) {
  const $ = cheerio.load(responseBuffer);
  // Text content of every element matching `selector`, in document order.
  const textsOf = (selector) => $(selector).toArray().map((node) => $(node).text());
  return {
    url: queueItem.url,
    name: $('.recipe-title h1').text().trim(),
    image: $('.photo.pic.u-photo').attr('src'),
    ingredients: textsOf('.p-ingredient'),
    instructions: textsOf('.instructions.e-instructions li'),
    // NOTE(review): the trailing `yield` is a descendant *tag* selector, not a
    // class; if the markup uses class="p-yield num yield" this never matches.
    // Confirm against the live page before changing it.
    yield: $('.p-yield.num yield').attr('value'),
    preptime: $('.dt-duration').attr('datetime'),
  };
}
// Parse and print only the pages whose URL matches the recipe pattern;
// everything else the crawler fetches is ignored.
myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  if (re.test(queueItem.url)) {
    const parsed = parseResponse(queueItem, responseBuffer);
    console.log(parsed);
  }
});

myCrawler.start();
'use strict';
const cheerio = require('cheerio');
const Crawler = require('simplecrawler');
// Crawler for www.tudogostoso.com.br whose recipes are sent to Elasticsearch.
const myCrawler = new Crawler('www.tudogostoso.com.br');
// Recipe pages look like /receita/<numeric id>-<slug>.html (case-insensitive).
const re = /\/receita\/([0-9]+)-([\w-]+)\.html$/i;
const elasticsearch = require('elasticsearch');
// Elasticsearch client pointed at localhost:9200 with the 'trace' log setting.
const client = new elasticsearch.Client({ host: 'localhost:9200', log: 'trace' });

// Crawler tuning; see the simplecrawler docs for the exact meaning of each option.
Object.assign(myCrawler, {
  interval: 100,
  stripQuerystring: true,
  maxConcurrency: 16,
});

// Fetch condition: returns false (do not fetch) when the path matches any of
// the patterns below, true otherwise. `verifyUrl` holds whatever
// addFetchCondition returns.
const verifyUrl = myCrawler.addFetchCondition((parsedURL, queueItem) => {
  const excluded = [
    /\.ico$/i,
    /\.css$/i,
    /\.png$/i,
    /\.gif$/i,
    /\.jpg$/i,
    /\.js$/i,
    /print_recipe\.php/i,
    /\/print$/i,
  ];
  return !excluded.some((pattern) => parsedURL.path.match(pattern));
});
/**
 * Builds a recipe record from a fetched recipe page.
 *
 * @param {Object} queueItem - Crawler queue item; only its `url` is read.
 * @param {Buffer|string} responseBuffer - Page HTML, handed to cheerio.load.
 * @returns {Object} Record with `url`, `name`, `image`, `ingredients`,
 *   `instructions`, `yield` and `preptime`. Attribute-based fields are
 *   undefined when the matching element or attribute is absent.
 */
function parseResponse(queueItem, responseBuffer) {
  const $ = cheerio.load(responseBuffer);
  // Text content of every element matching `selector`, in document order.
  const textsOf = (selector) => $(selector).toArray().map((node) => $(node).text());
  return {
    url: queueItem.url,
    name: $('.recipe-title h1').text().trim(),
    image: $('.photo.pic.u-photo').attr('src'),
    ingredients: textsOf('.p-ingredient'),
    instructions: textsOf('.instructions.e-instructions li'),
    // NOTE(review): the trailing `yield` is a descendant *tag* selector, not a
    // class; if the markup uses class="p-yield num yield" this never matches.
    // Confirm against the live page before changing it.
    yield: $('.p-yield.num yield').attr('value'),
    preptime: $('.dt-duration').attr('datetime'),
  };
}
/**
 * Sends a recipe to Elasticsearch (index `crawler`, type `recipe`).
 *
 * Any query string on the recipe's image URL is replaced with fixed
 * crop/size parameters before indexing. The rewrite is applied to a copy, so
 * the caller's object is left untouched, and a recipe without a string `image`
 * is indexed as-is instead of throwing.
 *
 * @param {Object} recipe - Recipe record, e.g. as returned by parseResponse.
 * @returns {void} The outcome is only reported via the console.
 */
function indexRecipe(recipe) {
  const doc = Object.assign({}, recipe);
  if (typeof doc.image === 'string') {
    const baseImage = doc.image.split('?')[0];
    // Presumably resize parameters understood by the image host.
    doc.image = `${baseImage}?mode=crop&width=350&height=230`;
  }
  client.index({
    index: 'crawler',
    type: 'recipe',
    body: doc
  }, (error, response) => {
    if (error) {
      console.error(error);
    } else {
      console.log(response);
    }
  });
}
// Index only pages whose URL matches the recipe pattern and whose parsed
// record has an image; everything else the crawler fetches is ignored.
myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  if (re.test(queueItem.url)) {
    const parsed = parseResponse(queueItem, responseBuffer);
    if (parsed.image) {
      indexRecipe(parsed);
    }
  }
});

myCrawler.start();