记一次node爬虫经历

本文最后更新于： 2022年10月10日下午

需求说明

访问网站
获取列表页数据
循环列表页数据获取每条数据对应的详情页数据
- 机构名称
- 机构类型
- 机构性质
- 联系人
- 固定电话
- 联系电话
- 联系地址
导出 excel

开发环境

Mac OS v10.15.7
node v12.16.1
用到 npm 包

"dependencies": {
  "async": "^3.2.0",
  "cheerio": "^1.0.0-rc.5",
  "json2xls": "^0.1.2",
  "nodemon": "^2.0.7",
  "puppeteer-core": "^8.0.0",
  "request": "^2.88.2"
}

准备工作

cheerio 学习操作语法基本跟 jQuery 一致
request 请求模块的基本使用
puppeteer 无头浏览器的基本使用
node fs 文件模块的基本使用

遇到的问题:

页面中的数据是 ajax 加载出来的

刚开始使用 request 直接请求页面，发现响应回来的 html 文档并不是完整的，页面上有 ajax 请求，动态生成了一部分 DOM , 我原本想分析它这个 ajax 请求的接口，发现请求地址上有个查询字符串,像这样：
?ajaxtype=yanglaoxx_showlianxi&rand=0.17592822223231708
不懂这个 rand 值是怎么计算出来的，实在不想再去扒网站 js 的代码，于是想到用无头浏览器来渲染完整网页；

这里我使用 puppeteer 渲染完整的网页

// Treat navigation as finished only once all network requests on the page
// have ended, so the ajax-generated DOM is part of the result.
const navigationOptions = { waitUntil: "networkidle0" };
await page.goto(url, navigationOptions);
// Save the page puppeteer fetched as a local html file so it is easy to check
// whether the markup is complete.
const html = await page.content();
fs.writeFileSync("./loacl.html", html);

我只安装了 puppeteer-core 核心，使用 puppeteer-core需要手动指定已安装的 Chrome 浏览器的安装路径。
Mac 电脑上 Chrome 浏览器的安装路径，可以通过在浏览器地址栏中输入 chrome://version 来查看。

// puppeteer-core needs the path of an already installed Chrome to be given by hand.
const chromePath =
  "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome";
const browser = await puppeteer.launch({
  headless: true,
  executablePath: chromePath,
  // args: [ '--proxy-server=http://188.166.215.141:3128' ] // replace with a working proxy server of your own
});

这是 puppeteer 返回的完整 html 结构，包含了 ajax 请求动态生成的 DOM 结构:
4YZvCU

我现在遇到一个问题：在 puppeteer 中使用 cheerio 解析 html 文档和在浏览器使用 jquery 的表现不同。

疑问：这两种 dom 选择有区别吗？

1
2

$("#ContactUs > .leftcontext:eq(0)  > .leftcontexttitle:eq(3)").text(); // cheerio cannot resolve this; jQuery in the browser runs it fine
$("#ContactUs > .leftcontext:eq(0)").children(".leftcontexttitle:eq(3)").text(); // cheerio resolves this one successfully

网站反爬

遇到 pyppeteer.errors.TimeoutError: Navigation Timeout Exceeded: 30000 ms exceeded
爬取详情页时，访问次数多了 puppeteer 就会报错响应超时。我的思路是让 puppeteer 打开页面前随机等待 3-10 秒，同时让 puppeteer 等待网站响应的时间无限长。

/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} time - delay handed to setTimeout, in milliseconds.
 * @returns {Promise<boolean>} promise that fulfils with `true` once the timer fires.
 */
const sleep = (time) =>
  new Promise((wake) => setTimeout(wake, time, true));
/**
 * Pick a pseudo-random integer from Math.random().
 * For integer arguments with n <= m the result lies in [n, m], both ends included.
 *
 * @param {number} n - lower bound.
 * @param {number} m - upper bound.
 * @returns {number} the floored random value.
 */
function getRandom(n, m) {
  const span = m - n + 1;
  return Math.floor(Math.random() * span + n);
}
// Pause for a random 3-10 s before opening the page.
const pauseMs = getRandom(3000, 10000);
await sleep(pauseMs);
await page.goto(url, {
  // How long the page load may take before a timeout error is raised; 0 waits indefinitely.
  timeout: 0,
  // "load" behaved the same as "networkidle0" here; the difference between the two still needs investigating.
  waitUntil: "load",
});

需要再去研究的

load networkidle0 domcontentloaded 三者的区别？
在 puppeteer 中使用 cheerio 解析 html 文档和在浏览器使用 jquery 的表现不同
node 中异步流程控制与错误处理

开始爬取需要的数据

爬取中
iRuOZ9
导出为 excel
p0wkT2

其实就是一个功能极其简单的玩具爬虫

完整的代码去掉注释不到 100 行

循环列表页面得到跳转到详情页的地址整理成一个数组
循环得到的数组，依次访问对应的页面，将需要的详情页上的数据整理成数组
导出数组为 excel

// 请求模块（1.访问网站）
const request = require("request");
// 可以看做成node版的jQuery（2.获取页面指定数据源）
const cheerio = require("cheerio");
// node异步流程控制 异步循环（3.根据页面数据源再访问详情数据）
const async = require("async");
const fs = require("fs");
// 无头浏览器
const puppeteer = require("puppeteer-core");
// json 导出 excel
const json2xls = require("json2xls");
/**
 * Promise-based delay.
 *
 * @param {number} time - milliseconds to wait, passed straight to setTimeout.
 * @returns {Promise<boolean>} fulfils with `true` when the timer fires.
 */
const sleep = (time) =>
  new Promise((settle) => {
    setTimeout(settle, time, true);
  });
/**
 * Draw a pseudo-random integer using Math.random().
 * With integer arguments and n <= m, the value is between n and m inclusive.
 *
 * @param {number} n - lower bound.
 * @param {number} m - upper bound.
 * @returns {number} the floored random value.
 */
function getRandom(n, m) {
  const width = m - n + 1;
  const scaled = Math.random() * width + n;
  return Math.floor(scaled);
}
/**
 * Render one detail page in headless Chrome and extract its fields.
 *
 * Pauses a random 3-10 s, launches a fresh browser through puppeteer-core,
 * loads `url` with the navigation timeout disabled, then parses the rendered
 * HTML with cheerio.
 *
 * @param {string} url - address of the detail page.
 * @returns {Promise<Object>} record holding `机构名称` and `数据来源` (the url itself).
 * @throws whatever puppeteer raises while launching, navigating or reading the
 *   page; the browser is closed before the error propagates.
 */
async function requestInfo(url) {
  console.log("开始请求详情页", url);
  // Throttle: wait a random 3-10 s *before* the browser is launched and the page opened.
  await sleep(getRandom(3000, 10000));
  const browser = await puppeteer.launch({
    executablePath:
      "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    headless: true,
    // args: [ '--proxy-server=http://188.166.215.141:3128' ] // replace with a working proxy server of your own
  });
  try {
    const page = await browser.newPage();
    await page.goto(url, {
      waitUntil: "load",
      timeout: 0, // 0 = wait indefinitely instead of raising a navigation timeout
    });
    const html = await page.content();
    // Debug aid: fs.writeFileSync('./test.html', html)
    const $ = cheerio.load(html);
    const info = {};
    info.机构名称 = $(
      "#BasicInformation > .leftcontext_left > .leftcontexttitle > label"
    )
      .text()
      .trim();
    // .... extract the remaining fields here
    info.数据来源 = url;
    console.log("info==", info);
    return info;
  } finally {
    // Always shut Chrome down, and wait for it, even when navigation or
    // parsing throws; otherwise each failure leaves a browser process behind.
    await browser.close();
  }
}

/**
 * Fetch one list page and collect the detail-page links found on it.
 *
 * @param {number} [page=1] - page number appended to the list URL.
 * @param {Function} callback - Node-style callback `(err, list)`; on success
 *   `list` is an array of `{ link }` objects, one per entry that has an href.
 * @returns {Promise<void>} settles as soon as the request has been issued; the
 *   actual result is delivered through `callback`.
 */
async function requestPage(page = 1, callback) {
  console.log("开始请求列表页面", page);
  request(
    {
      url: "你的目标页面" + page,
      method: "get",
      headers: {
        "User-Agent":
          "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        Accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        // 'Accept-Encoding': 'gzip, deflate',
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
      },
      // The body is decoded explicitly with toString() below.
      encoding: null,
    },
    (err, res, body) => {
      // Report transport failures instead of crashing on an undefined body.
      if (err) {
        callback(err);
        return;
      }
      let list;
      try {
        const $ = cheerio.load(body.toString());
        list = [];
        // One entry per list item: the href of its direct <a> child.
        $("#yly_list_div  div[class='jiadiantujianjie_but']").each((i, el) => {
          const link = $(el).children("a").attr("href");
          // An entry without an href cannot be visited later, so leave it out.
          if (link) {
            list.push({ link });
          }
        });
      } catch (parseErr) {
        callback(parseErr);
        return;
      }
      // Invoked outside the try block so an exception thrown by the callback
      // itself is never mistaken for a parse error and reported twice.
      callback(null, list);
    }
  );
}
/**
 * Crawl the list pages one after another, visit every detail page they link
 * to, and export the collected records to "成都.xlsx".
 *
 * A detail page that fails is logged and skipped, so the records gathered so
 * far are still exported. A failure while fetching the list pages or writing
 * the file is logged and the process exits with code 1.
 *
 * @param {number} [pageCount=20] - number of list pages to request (1..pageCount).
 * @returns {Promise<void>}
 */
async function main(pageCount = 20) {
  const requestList = [];
  for (let i = 1; i <= pageCount; i++) {
    requestList.push((callback) => {
      requestPage(i, callback);
    });
  }
  console.log("requestList", requestList); // an array made up entirely of functions
  try {
    // Called without a final callback, async v3 `series` returns a promise for
    // the array of per-task results and rejects on the first task error.
    const pages = await async.series(requestList);
    // Each task yields an array of { link }; merge them into one flat list.
    const links = pages.flat();
    const infoList = [];
    for (const { link } of links) {
      try {
        infoList.push(await requestInfo(link));
      } catch (err) {
        console.error("详情页抓取失败，已跳过", link, err);
      }
    }
    // Convert the collected records to an excel workbook and write it out.
    fs.writeFileSync("成都.xlsx", json2xls(infoList), "binary");
    process.exit();
  } catch (err) {
    console.error(err);
    process.exit(1);
  }
}

main();

参考

代码人生

nodejs

本博客所有文章除特别声明外，均采用 CC BY-SA 4.0 协议，转载请注明出处！

记录从食指伸肌腱手术后甲根长期感染不愈合到二次手术的过程上一篇

我重新写博客了下一篇