Crawleando sites com NodeJS

JSday Campina Grande 2016

Allisson Azevedo

Allisson Azevedo

allissonazevedo.com

youtube.com/user/allissonazevedo

github.com/allisson

twitter.com/allisson

allisson.github.io/slides/

allisson@gmail.com

Objetivo

Web Crawler / Robot / Spider

  • Um programa que navega por toda a rede de maneira automática
  • Googlebot, BingBot, Yahoo! Slurp, Baiduspider
  • Opção quando não houver acesso aos dados via Web API

Funcionamento

  1. Carrega url
  2. Parser do conteúdo
  3. Carrega novas urls a partir dos links da atual

Ferramentas

  • npm install request
  • npm install cheerio
  • npm install simplecrawler

Speakers do JSday


'use strict';
const request = require('request');
const cheerio = require('cheerio');

// Fetches the JSday speakers page and prints every speaker as pretty JSON.
request('http://jsday.com.br/speakers/', (error, response, body) => {
  // Guard: on a network error `body` is undefined and cheerio.load() throws.
  if (error || response.statusCode !== 200) {
    console.error('Failed to load speakers page:', error || response.statusCode);
    return;
  }
  const $ = cheerio.load(body);
  const speakers = [];
  $('.people-modal').each((i, element) => {
    const $el = $(element);
    const speaker = {};
    speaker.title = $el.find('h4').text();
    speaker.description = $el.find('.theme-description').text();
    // First text node inside .name is the speaker's name; trim stray whitespace.
    speaker.name = $el.find('.name').contents()[0].data.trim();
    speaker.about = $el.find('.about').text();
    // css() returns "url(...)"; strip the wrapper to keep only the image url.
    speaker.image = $el.find('.people-img').css('background-image').replace('url(', '').replace(')', '');
    speakers.push(speaker);
  });
  console.log(JSON.stringify(speakers, null, 2));
});
                

Programação do JSday


'use strict';
const request = require('request');
const cheerio = require('cheerio');

// Fetches the JSday schedule page and prints each talk (title + start time).
request('http://jsday.com.br/schedule/', (error, response, body) => {
  // Guard: on a network error `body` is undefined and cheerio.load() throws.
  if (error || response.statusCode !== 200) {
    console.error('Failed to load schedule page:', error || response.statusCode);
    return;
  }
  const $ = cheerio.load(body);
  const subEvents = [];
  // Talks are marked up with schema.org subEvent microdata.
  $('.timeslot[itemtype="http://schema.org/subEvent"]').each((i, element) => {
    const event = {};
    event.title = $(element).find('.slot-title').text();
    event.time = $(element).find('.start-time').attr('datetime');
    subEvents.push(event);
  });
  console.log(JSON.stringify(subEvents, null, 2));
});
                

Crawler simples


'use strict';
const request = require('request');
const cheerio = require('cheerio');
const startUrl = 'https://allissonazevedo.com/';
const hostname = new RegExp(startUrl);
const urlSet = new Set(); // urls already queued/fetched — each page visited once
const pages = [];

// Drop query string and fragment so variants of one page dedupe together.
function normalizeUrl(url) {
  return url.split('?')[0].split('#')[0];
}

// Skip non-HTML endpoints: sitemaps, RSS feeds and AMP duplicates.
function verifyUrl(url) {
  return !(
    /\.xml$/i.test(url) ||
    /\/feed\/$/i.test(url) ||
    /\/amp\/$/i.test(url)
  );
}

// Fetches a page, records its <title>, then recursively follows links that
// stay on the start host.
function getPage(url) {
  url = normalizeUrl(url);
  if (urlSet.has(url) || !verifyUrl(url)) {
    return;
  }
  urlSet.add(url);
  request(url, (error, response, body) => {
    if (error || response.statusCode !== 200) {
      return;
    }
    const $ = cheerio.load(body);
    pages.push({ url: url, title: $('title').text() });
    $('[href]').each((i, element) => {
      const newUrl = normalizeUrl($(element).attr('href'));
      // Only absolute urls matching the start host are followed; relative
      // links never match the hostname regex and are skipped.
      if (hostname.test(newUrl)) {
        getPage(newUrl);
      }
    });
  });
}

getPage(startUrl);

// All requests are async; dump the results when the event loop drains.
process.on('exit', () => {
  console.log(JSON.stringify(pages, null, 2));
});
                

Simplecrawler


'use strict';
const Crawler = require('simplecrawler');
const myCrawler = new Crawler('allissonazevedo.com');
const cheerio = require('cheerio');
const pages = [];

myCrawler.interval = 100;          // ms between request starts
myCrawler.maxConcurrency = 16;
myCrawler.stripQuerystring = true; // dedupe urls that differ only in query

// Skip non-HTML endpoints: sitemaps, RSS feeds and AMP duplicates.
// addFetchCondition returns a condition id we never use, so don't keep it.
myCrawler.addFetchCondition((parsedURL, queueItem) => {
  return !(
    /\.xml$/i.test(parsedURL.path) ||
    /\/feed\/$/i.test(parsedURL.path) ||
    /\/amp\/$/i.test(parsedURL.path)
  );
});

// Record every fetched page's url and <title>.
myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  const $ = cheerio.load(responseBuffer);
  pages.push({ url: queueItem.url, title: $('title').text() });
});

myCrawler.start();

// Dump the collected pages once the crawl finishes and the process exits.
process.on('exit', () => {
  console.log(JSON.stringify(pages, null, 2));
});
                

Crawleando o Tudogostoso

  1. Verificar as urls que são carregadas
  2. Identificar as urls que são receitas
  3. Parser e indexação no Elasticsearch

Verificando urls


'use strict';
// First exploration step: crawl tudogostoso and print every url that gets
// fetched, so we can see which ones should later be filtered out.
const Crawler = require('simplecrawler');

const myCrawler = new Crawler('www.tudogostoso.com.br');
myCrawler.interval = 100;
myCrawler.maxConcurrency = 16;
myCrawler.stripQuerystring = true;

myCrawler.on('fetchcomplete', (queueItem) => {
  console.log(queueItem.url);
});

myCrawler.start();
                

Urls que devemos evitar

http://www.tudogostoso.com.br/favicon-v2.1.ico
http://www.tudogostoso.com.br/app/assets/stylesheets/ie.css
http://www.tudogostoso.com.br/images/layout/logo-v4.png
http://www.tudogostoso.com.br/assets/layout/blank.gif
http://www.tudogostoso.com.br/imagens/renew/footer-bg.jpg
http://www.tudogostoso.com.br/dicas/10-pontos-do-brigadeiro/print
http://www.tudogostoso.com.br/receita/print_recipe.php?recipe_id=2721
http://www.tudogostoso.com.br/receita/4683/comentarios.js

Evitando o download de urls


'use strict';
const Crawler = require('simplecrawler');
const myCrawler = new Crawler('www.tudogostoso.com.br');

// Static assets and printer-friendly duplicates we never want to download.
const skipUrl = /\.(?:ico|css|png|gif|jpg|js)$|print_recipe\.php|\/print$/i;

myCrawler.interval = 100;
myCrawler.stripQuerystring = true;
myCrawler.maxConcurrency = 16;

// addFetchCondition returns a condition id we never use, so don't keep it.
myCrawler.addFetchCondition((parsedURL, queueItem) => !skipUrl.test(parsedURL.path));

// Print every url that passed the filter above.
myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  console.log(queueItem.url);
});

myCrawler.start();
                

Urls de receitas

http://www.tudogostoso.com.br/receita/76147-anchova-assada.html


'use strict';
// A recipe url has the shape /receita/<numeric id>-<slug>.html
const re = /\/receita\/([0-9]+)-([\w-]+)\.html$/i;
const urls = [
  'http://www.tudogostoso.com.br/',
  'http://www.tudogostoso.com.br/categorias/bolos-e-tortas-doces.php',
  'http://www.tudogostoso.com.br/receita/179236-petit-gateau-de-nutella-perfeito.html'
];
// Classify each sample url against the recipe pattern.
urls.forEach((url) => {
  const message = re.test(url) ? url + ' é uma receita.' : url + ' não é uma receita';
  console.log(message);
});
                

Parser da receita


'use strict';
const cheerio = require('cheerio');
const Crawler = require('simplecrawler');
const myCrawler = new Crawler('www.tudogostoso.com.br');

// Recipe pages have the shape /receita/<numeric id>-<slug>.html
const re = /\/receita\/([0-9]+)-([\w-]+)\.html$/i;
// Static assets and printer-friendly duplicates we never want to download.
const skipUrl = /\.(?:ico|css|png|gif|jpg|js)$|print_recipe\.php|\/print$/i;

myCrawler.interval = 100;
myCrawler.stripQuerystring = true;
myCrawler.maxConcurrency = 16;

// addFetchCondition returns a condition id we never use, so don't keep it.
myCrawler.addFetchCondition((parsedURL, queueItem) => !skipUrl.test(parsedURL.path));

// Extracts a recipe object from a fetched recipe page (h-recipe-style markup).
function parseResponse(queueItem, responseBuffer) {
  const $ = cheerio.load(responseBuffer);
  const recipe = {
    url: queueItem.url,
    name: $('.recipe-title h1').text().trim(),
    image: $('.photo.pic.u-photo').attr('src'),
    ingredients: [],
    instructions: []
  };
  $('.p-ingredient').each((i, element) => {
    recipe.ingredients.push($(element).text());
  });
  $('.instructions.e-instructions li').each((i, element) => {
    recipe.instructions.push($(element).text());
  });
  // NOTE(review): '.p-yield.num yield' selects a <yield> child element —
  // looks like it may have been meant as '.p-yield .num'; confirm against
  // the live page markup before changing.
  recipe.yield = $('.p-yield.num yield').attr('value');
  recipe.preptime = $('.dt-duration').attr('datetime');
  return recipe;
}

myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  // Only recipe pages are parsed; other pages are just crawled for links.
  if (!re.test(queueItem.url)) {
    return;
  }
  console.log(parseResponse(queueItem, responseBuffer));
});

myCrawler.start();
                

Indexando no Elasticsearch


'use strict';
const cheerio = require('cheerio');
const Crawler = require('simplecrawler');
const elasticsearch = require('elasticsearch');
const myCrawler = new Crawler('www.tudogostoso.com.br');

// Recipe pages have the shape /receita/<numeric id>-<slug>.html
const re = /\/receita\/([0-9]+)-([\w-]+)\.html$/i;
// Static assets and printer-friendly duplicates we never want to download.
const skipUrl = /\.(?:ico|css|png|gif|jpg|js)$|print_recipe\.php|\/print$/i;
const client = new elasticsearch.Client({
  host: 'localhost:9200',
  log: 'trace'
});

myCrawler.interval = 100;
myCrawler.stripQuerystring = true;
myCrawler.maxConcurrency = 16;

// addFetchCondition returns a condition id we never use, so don't keep it.
myCrawler.addFetchCondition((parsedURL, queueItem) => !skipUrl.test(parsedURL.path));

// Extracts a recipe object from a fetched recipe page (h-recipe-style markup).
function parseResponse(queueItem, responseBuffer) {
  const $ = cheerio.load(responseBuffer);
  const recipe = {
    url: queueItem.url,
    name: $('.recipe-title h1').text().trim(),
    image: $('.photo.pic.u-photo').attr('src'),
    ingredients: [],
    instructions: []
  };
  $('.p-ingredient').each((i, element) => {
    recipe.ingredients.push($(element).text());
  });
  $('.instructions.e-instructions li').each((i, element) => {
    recipe.instructions.push($(element).text());
  });
  // NOTE(review): '.p-yield.num yield' selects a <yield> child element —
  // looks like it may have been meant as '.p-yield .num'; confirm against
  // the live page markup before changing.
  recipe.yield = $('.p-yield.num yield').attr('value');
  recipe.preptime = $('.dt-duration').attr('datetime');
  return recipe;
}

// Indexes one recipe document into the 'crawler' Elasticsearch index.
function indexRecipe(recipe) {
  // Normalize the image url to a fixed crop size before indexing.
  const baseImage = recipe.image.split('?')[0];
  recipe.image = baseImage + '?mode=crop&width=350&height=230';
  client.index({
    index: 'crawler',
    type: 'recipe',
    body: recipe
  }, (error, response) => {
    if (error) {
      console.error(error);
    } else {
      console.log(response);
    }
  });
}

myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  // Only recipe pages are indexed; other pages are just crawled for links.
  if (!re.test(queueItem.url)) {
    return;
  }
  const recipe = parseResponse(queueItem, responseBuffer);
  // Recipes without an image are skipped — indexRecipe would crash on
  // undefined.split and the webapp expects a thumbnail.
  if (!recipe.image) {
    return;
  }
  indexRecipe(recipe);
});

myCrawler.start();
                

Demo do WebApp

Obrigado!