JavaScript 可以爬虫吗?

网络爬虫 (又被称为网页蜘蛛, 网络机器人), 是一种按照一定的规则, 自动地抓取万维网信息的程序或者脚本.

JavaScript 爬虫:

提到爬虫, 大多数人的理解都停留在使用 Python 等后端语言编写的爬虫. 但实际上, 使用客户端 JavaScript 编写爬虫, 有诸多后端爬虫所不具备的优势:

可以方便的分享给其他人用, 只要对方电脑里有浏览器

由于跑在客户端, 几乎可以无视对方网站的反爬虫机制

可以拥有完善的 UI, 无开发基础的小白也可以随意使用

1, 爬虫相关的包

(1) 轻量级的 HTTP 请求库, 可处理 GET, POST, PUT, DELETE, HEAD 请求, 能够模仿浏览器登录

const request = require('superagent');

(2) 加载 html

const cheerio = require('cheerio');

(3) 加载文件系统模块, 将数据存到文件中时会用到

const fs = require('fs');
   // Save `content` to disk. Arguments: target file path, data to write,
   // and a completion callback that receives the error (if any).
   fs.writeFile('saveFiles/zybl.txt', content, (writeErr) => {
     if (writeErr) {
       throw writeErr;
     }
   });

(4) 将文件存为 xlsx

const fs = require('graceful-fs');

新建 xlsx 文件

// Open a write stream for the output file; the path is relative, not absolute.
const writeStream = fs.createWriteStream('saveFiles/trader.xlsx');

向 xlsx 里面写入内容

// Write `title` to the stream (`title` is not defined in this snippet; it must be supplied by the caller's code).
writeStream.write(title);

(5) 异步处理

const Promise = require('bluebird');

(6) 一个高层次的浏览器自动化库. 需要先安装 phantomjs, 然后再安装 nightmare

const Nightmare = require('nightmare');
(7) 基于 generator 的异步流程控制库

const co = require('co');

2, 爬虫代码

'use strict';
const co = require('co');
const fs = require('fs');
const Nightmare = require('nightmare'); // 可视化的浏览器
const url = 'http://sports.qq.com/isocce/';
/**
 * Shared failure handler: prints whatever error it is given to the console.
 *
 * @param {*} err - The error or rejection reason to report.
 * @returns {void}
 */
const onError = (err) => {
    console.log(err);
};
/**
 * Turns a scraped page title into a string that is safe to use as a file name.
 * Path separators and characters that are reserved on common file systems are
 * replaced with `_`, so a title can neither escape the output directory nor
 * make the write fail.
 *
 * @param {*} title - The scraped title (coerced to a string).
 * @returns {string} The sanitised file name, without extension.
 */
const toSafeFileName = function(title) {
    return String(title).replace(/[\\\/:*?"<>|]/g, '_');
};

/**
 * Loads one sub-page in its own Nightmare instance, reads the inner HTML of
 * its <body>, and saves it as `../../saveFiles/<title>.html`.
 *
 * A failed write is logged and does not reject the returned promise (the
 * original callback only logged it as well). Navigation/evaluation failures
 * reject the returned promise. The Nightmare instance is always ended.
 *
 * @param {{url: string, title: string}} pageUrl - Link to fetch and the title used for the file name.
 * @returns {Promise<void>} Settles once the page has been processed and the browser instance ended.
 */
const getHtml = function(pageUrl) {
    const pageScraper = new Nightmare(); // open a browser instance for this page
    return co(function * run() {
        try {
            yield pageScraper.goto(pageUrl.url).wait();
            console.log('222222' + pageUrl.url);
            const content = yield pageScraper.evaluate(() => document.querySelector('body').innerHTML);
            console.log('子页面链接');
            console.dir(content);
            const fileName = toSafeFileName(pageUrl.title) + '.html';
            // fs.writeFile returns undefined, which co refuses to yield; wrap the
            // callback API in a promise so the generator really waits for the write.
            const saveError = yield new Promise((resolve) => {
                fs.writeFile('../../saveFiles/' + fileName, content, (err) => {
                    resolve(err || null);
                });
            });
            console.log('存文件.......');
            if (saveError) {
                console.log(saveError);
            } else {
                console.log('Save pageUrl content to' + fileName);
            }
        } finally {
            // Always shut the browser instance down, even when a step above failed.
            yield pageScraper.end();
        }
    });
};
// Entry point: opens the listing page, expands the feed, collects the article
// links, and schedules a staggered getHtml() call for each of them.
co(function * run() {
    // Open a visible browser window for the listing page.
    const scraper = new Nightmare({
        show: true
    });
    try {
        yield scraper.goto(url).wait();
        // Click the feed's link five times, pausing 2 s before each click
        // (presumably the "load more" control - confirm against the page markup).
        for (let i = 0; i < 5; i++) {
            yield scraper.wait(2000).click('#feed-laliga> a');
        }
        // Runs inside the page: collect title/href of every headline link.
        const links = yield scraper.evaluate(() => {
            const anchors = document.querySelectorAll('#feed-laliga h3> a');
            const list = [];
            for (const each of anchors) {
                console.log('each');
                console.log(each);
                list.push({
                    title: each.innerText,
                    url: each.href,
                });
            }
            return list;
        });
        console.log('这里');
        console.dir(links);
        let counter = 0;
        for (const link of links) {
            if (link !== null && link.url !== 'javascript:void(0)') {
                counter += 1;
                // Stagger the sub-page fetches so they do not all start at once.
                setTimeout(() => {
                    // Handle rejections here: nothing else awaits this promise.
                    getHtml(link).catch(onError);
                }, counter * links.length * 250);
            }
        }
    } finally {
        // Close the listing browser even if navigation or scraping failed.
        yield scraper.end();
    }
}).catch(onError);

来源: http://www.css88.com/qa/javascript/11626.html

与本文相关文章

暂无,快来抢沙发吧！