超简单的node爬虫小案例

和前端爬取时用的参数一样,需要依次输入三个参数进行爬取:接口地址、请求头(JSON 格式)、请求参数(JSON 格式)。

注意点也一样:
注意:分页相关的字段需要在代码里按需修改——根据你要爬取的接口及其业务规则,调整代码中的字段名。比如我这里总条数的字段叫 total,别人的接口不一定;返回的数据我这里是 data.rows,先看看对方接口用的是什么字段,再改代码;再比如我这里的分页参数叫 pageNum,别人的可能叫 pageNo。


记得先手动建好 data 目录,爬下来的数据会存放在里面。

依赖

{
  "dependencies": {
    "axios": "^1.6.5",
    "colors": "^1.4.0",
    "fs": "^0.0.1-security",
    "readline": "^1.3.0"
  }
}

代码

const readline = require("readline");
const axios = require('axios');
const fs = require("fs");
const colors = require('colors');


// Shared readline interface: prompts are written to stdout and answers are
// read from stdin.
const RL = readline.createInterface({ input: process.stdin, output: process.stdout });

// Promise wrapper around RL.question.
// Shows the given prompt followed by a tab character and resolves with the
// value RL passes to the callback. The returned promise never rejects.
function question(question) {
  const promptText = `${question}\t`;
  return new Promise((resolve) => {
    RL.question(promptText, (answer) => resolve(answer));
  });
}


// Crawl state shared by the functions in this file.
let total = 0; // total row count, taken from the first response
let pageNum = 1; // page number sent with the next request
const pageSize = 30; // rows requested per page
let api = ""; // endpoint entered at the prompt
let headers = ""; // request headers entered at the prompt (JSON text)
let params = ""; // request parameters entered at the prompt (JSON text)




// Ask for the endpoint, the request headers and the request parameters, one
// prompt after another, storing each answer in its module-level variable.
// Closing the readline interface afterwards ends terminal input.
(async function () {
  api = await question("请输入接口:");
  headers = await question("请输入请求头:");
  params = await question("请输入参数:");
  RL.close();
})();

// When the readline interface closes (RL.close() is called once all prompts
// have been answered), terminal input is over and the crawl starts.
RL.on("close", function () {
  console.log(`<<<------------------------- 开始爬取 ------------------------->>>\n`.blue);
  // crawling() is async. Without a rejection handler, invalid JSON typed at a
  // prompt or a failed request would surface only as an unhandled promise
  // rejection, so report the error and exit with a failure status instead.
  crawling().catch(function (err) {
    console.log(err);
    console.log('================= 数据读取失败 ================='.red);
    process.exit(1);
  });
});


// Parse JSON text typed at a prompt. Blank input (the user just pressed
// Enter) means "no extra fields" and yields an empty object, because
// JSON.parse("") would throw a SyntaxError.
function parseJsonInput(text) {
  const trimmed = String(text).trim();
  return trimmed === "" ? {} : JSON.parse(trimmed);
}

// Request one page from the endpoint.
// Sends a POST to `api` with a JSON body made of the user-supplied params plus
// the current `pageSize` and `pageNum` (these two override same-named keys in
// the params). User-supplied headers are merged over the default Content-Type.
// Resolves with the response body (response.data); rejects if the request
// fails or if non-blank headers/params are not valid JSON.
async function getData() {
  const response = await axios({
    url: api,
    method: "post",
    headers: {
      "Content-Type": "application/json",
      ...parseJsonInput(headers),
    },
    data: JSON.stringify({
      ...parseJsonInput(params),
      "pageSize": pageSize,
      "pageNum": pageNum,
    }),
  });
  return response.data;
}

// Entry point of the crawl.
// Fetches the first page and logs the raw response. The response is treated
// as a failure unless its `code` field is exactly 0. On success it records
// `data.total`, prints the page count, saves `data.rows` as page 1 and then
// fetches the remaining pages through loading().
async function crawling() {
  const data = await getData();
  console.log(data);
  if (data.code !== 0) {
    console.log('================= 数据读取失败 ================='.red);
    // Exit with a non-zero status so a calling shell or script can tell that
    // the crawl failed.
    process.exit(1);
    return;
  }

  console.log('================= 数据读取成功 ================='.green);
  total = data.total;
  const page = Math.ceil(total / pageSize);
  console.log(`共${page}页`);
  saveFile(data.rows, `第1页`);
  // Await the remaining pages so that a failure on a later page rejects this
  // function's promise instead of being lost in a floating promise.
  await loading();
}

// Fetch and save every page after the first one, then end the process.
// The number of pages is derived from the shared `total` and `pageSize`;
// `pageNum` is advanced before each request so getData() asks for the next page.
async function loading() {
  const lastPage = Math.ceil(total / pageSize);
  let current = 1;
  while (current < lastPage) {
    pageNum += 1;
    current += 1;
    const body = await getData();
    saveFile(body.rows, `第${current}页`);
  }

  console.log(`<<<------------------------- 爬取完毕,已下载数据 ------------------------->>>\n`.bgGreen);
  // Put the shared counters back to their initial values.
  pageNum = 1;
  total = 0;
  // process.exit() ends the process immediately, so this relies on saveFile
  // having finished its writes by the time we get here.
  process.exit(0);
}



// Write one page of results to data/<name>.json.
//
// The write is synchronous on purpose: loading() calls process.exit() right
// after handing over the last page, and process.exit() does not wait for
// pending asynchronous stream writes, so a write stream could lose that file.
// When this function returns, the write call has completed.
//
// res  - value serialised with JSON.stringify (the rows of one page)
// name - file name without extension, e.g. "第1页"
//
// On any failure the error is logged and the process exits with status 1.
function saveFile(res, name) {
  console.log(`<<<------------------------- 开始写入 ------------------------->>>\n`.blue);
  console.log(name);
  try {
    // Create the output directory on first use instead of failing with ENOENT.
    fs.mkdirSync("data", { recursive: true });
    fs.writeFileSync("data/" + name + ".json", JSON.stringify(res));
    console.log(`<<<------------------------- 写入完成 ------------------------->>>\n`.green);
  } catch (err) {
    console.log(err);
    console.log(`<<<------------------------- 写入错误 ------------------------->>>\n`.red);
    // Non-zero status: a failed write must not look like a successful crawl.
    process.exit(1);
  }
}


超简单的node爬虫小案例
https://github.com/chergn/chergn.github.io/7e15df4c659d/
作者
全易
发布于
2024年3月28日
许可协议