How can this process be sped up, and how can I avoid re-visiting pages that were already crawled? That is what the `urls` array is for — but for a large site it will grow huge, and lookups against it will become slow. :(
// Third-party crawler library (a node-crawler fork): fetches pages and
// invokes the configured callback with a Cheerio handle per page.
var Crawler = require("node-webcrawler");
// Node core 'url' module.
var url = require('url');
// Accumulator of every URL queued so far, used to avoid crawling a page twice.
var urls = [];
// Crawler instance: fetches each queued URL and feeds discovered links back
// into the queue via spider().
var c = new Crawler({
    maxConnections : 1, // raise this to crawl several pages in parallel
    userAgent: 'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR; rv:1.8.1.17) Gecko/20080829 Firefox/2.0.0.17',
    //cache:true,
    // This will be called for each crawled page.
    callback : function (error, result, $) {
        // $ is Cheerio by default —
        // a lean implementation of core jQuery designed specifically for the server.
        if (error) {
            console.log(error);
            return;
        }
        // Base URL of the fetched page, needed to turn relative hrefs
        // (e.g. "/about") into absolute URLs before queueing them.
        // NOTE(review): assumes node-webcrawler exposes the request URI at
        // result.request.uri.href — confirm against the library's docs.
        var base = (result && result.request && result.request.uri)
            ? result.request.uri.href
            : undefined;
        $("a").each(function () {
            var href = $(this).attr("href");
            if (!href) {
                return; // anchor without an href attribute
            }
            // Resolve relative links against the page URL; absolute links
            // pass through url.resolve unchanged.
            spider(base ? url.resolve(base, href) : href);
        });
    }
});
// Queue a discovered URL for crawling, unless it was already seen.
// Returns false when the URL is rejected; otherwise undefined.
var spider = function(url){
    // Reject empty hrefs and javascript: pseudo-links (scheme prefix only —
    // the original rejected the substring anywhere in the URL).
    if (!url || url.indexOf("javascript:") === 0) {
        return false;
    }
    // Strip the fragment so "page.html" and "page.html#sec" count as the
    // same page (the original skipped any URL containing '#' entirely,
    // which meant fragment links were never crawled at all).
    var cleaned = url.split("#")[0];
    if (!cleaned) {
        return false; // href was a bare fragment like "#top"
    }
    // Only queue each URL once. NOTE: indexOf over a growing array is O(n)
    // per lookup; for large crawls replace `urls` with a Set or object map.
    if (urls.indexOf(cleaned) < 0) {
        urls.push(cleaned);
        console.log(cleaned);
        c.queue(cleaned);
    }
};
// Seed the crawl: queue just one starting URL, with the default callback above.
c.queue('http://www.example.com');