Promise重构http爬虫代码
var http = require('http');
var cheerio = require('./node_modules/cheerio');
// Ids of the courses to crawl; each id is appended to the URL prefixes below.
const videoIds = ['728', '637', '348', '259', '197', '134', '75'];

// URL prefixes: the course page HTML and the endpoint that reports learner counts.
const baseUrl = {
  htmlUrl: 'http://www.imooc.com/learn/',
  numberUrl: 'http://www.imooc.com/course/AjaxCourseMembers?ids='
};

// URL of a single course page; nothing in the visible code reads this value.
const url = 'http://www.imooc.com/learn/348';
/**
 * Extracts the course title and the chapter/video outline from a course page.
 *
 * @param {string} html Raw HTML of a course page.
 * @returns {{title: string, number: string, chapters: Array<{chapterTitle: string, videos: Array<{title: string, time: string, id: (string|undefined)}>}>}}
 *   The parsed course. `number` is always returned as an empty string; this
 *   function does not fill it in. A video's `time` is an empty string when the
 *   link text has no second line, and `id` is undefined when the link has no
 *   `video/` segment in its href.
 */
function filterChapters(html) {
  const $ = cheerio.load(html);

  // Text of the selection's direct text nodes only (nodeType 3), so text that
  // lives inside child elements is not included.
  function ownText(selection) {
    return selection.contents().filter(function () {
      return this.nodeType === 3;
    }).text().trim();
  }

  const coursesData = {
    title: $('.hd h2').text().trim(),
    number: '',
    chapters: []
  };

  $('.chapter').each(function () {
    const chapter = $(this);
    const chapterData = {
      chapterTitle: ownText(chapter.find('strong')),
      videos: []
    };

    chapter.find('.video').children('li').each(function () {
      const video = $(this).find('.J-media-item');
      // The link text carries the video title and its duration on separate lines.
      const parts = ownText(video).split('\n');
      // Fall back to '' so a list item without a media link does not throw.
      const href = video.attr('href') || '';
      chapterData.videos.push({
        title: parts[0].trim(),
        time: (parts[1] || '').trim(),
        id: href.split('video/')[1]
      });
    });

    coursesData.chapters.push(chapterData);
  });

  return coursesData;
}
/**
 * Prints every course, with its chapters and videos, in a single console.log call.
 *
 * Each course contributes a header line (title and learner count), then one
 * titled section per chapter listing its videos as `[id] title time:time`,
 * followed by two blank lines.
 *
 * @param {Array<{title: string, number: *, chapters: Array<{chapterTitle: string, videos: Array<{title: string, time: string, id: string}>}>}>} coursesData
 *   Courses to print, in the order given.
 */
function printCoursrInfo(coursesData) {
  let courseMessage = '';
  coursesData.forEach((course) => {
    courseMessage += `${course.title} 学习人数:${course.number}\n`;
    // Read chapters straight off the course; no shared variable is needed.
    course.chapters.forEach((chapter) => {
      courseMessage += `\n${chapter.chapterTitle}\n`;
      chapter.videos.forEach((video) => {
        courseMessage += `[${video.id}] ${video.title} time:${video.time}\n`;
      });
    });
    courseMessage += '\n\n';
  });
  console.log(courseMessage);
}
/**
 * Asynchronously fetches a course page's HTML and then its learner count.
 *
 * The two requests run one after the other: the count is requested only after
 * the HTML body has been fully received.
 *
 * @param {{htmlUrl: string, numberUrl: string}} url
 *   `htmlUrl` is the course page; `numberUrl` must return JSON whose
 *   `data[0].numbers` field holds the learner count.
 * @returns {Promise<{html: string, number: *}>}
 *   Resolves with the page HTML and the raw `numbers` value. Rejects when either
 *   request fails or when the count response is not JSON of the expected shape.
 */
function getPageAsync(url) {
  console.log('正在爬取网页的内容: ' + url.htmlUrl + '\n');

  return fetchResponseText(url.htmlUrl)
    .catch((e) => {
      console.log('获取html失败: ' + e.message);
      throw e;
    })
    .then((html) => {
      console.log('html获取完毕,开始获取学习人数......');
      return fetchResponseText(url.numberUrl)
        .then((resData) => {
          // Parsing inside the chain turns a malformed response into a
          // rejection instead of an uncaught exception in an event callback.
          const number = JSON.parse(resData).data[0].numbers;
          console.log('获取学习人数成功');
          return { html: html, number: number };
        })
        .catch((e) => {
          console.log('获取人数失败: ' + e.message);
          throw e;
        });
    });
}

/**
 * Performs an HTTP GET and resolves with the full response body as a string.
 *
 * @param {string} target URL to request.
 * @returns {Promise<string>} Response body decoded as UTF-8.
 */
function fetchResponseText(target) {
  return new Promise((resolve, reject) => {
    http.get(target, (res) => {
      let body = '';
      // Decode at the stream level so a multi-byte character split across two
      // chunks is reassembled rather than corrupted.
      res.setEncoding('utf8');
      res.on('data', (chunk) => {
        body += chunk;
      });
      res.on('end', () => {
        resolve(body);
      });
      res.on('error', reject);
    }).on('error', reject);
  });
}
// Start one fetch per course id; the requests for different courses run concurrently.
const fetchCourseArray = videoIds.map((id) => getPageAsync({
  htmlUrl: baseUrl.htmlUrl + id,
  numberUrl: baseUrl.numberUrl + id
}));

Promise
  .all(fetchCourseArray)
  .then((pages) => {
    const coursesData = pages.map((content) => {
      const course = filterChapters(content.html);
      course.number = parseInt(content.number, 10);
      return course;
    });
    // Largest learner count first. The comparator must return a negative, zero
    // or positive number; a boolean result does not order the array reliably.
    coursesData.sort((a, b) => b.number - a.number);
    printCoursrInfo(coursesData);
  })
  .catch((err) => {
    // Promise.all rejects as soon as any single fetch fails; report it here
    // rather than leaving an unhandled rejection.
    console.error('爬取课程信息失败: ' + (err && err.message ? err.message : err));
  });
点击查看更多内容
2人点赞
评论
共同学习,写下你的评论
评论加载中...
作者其他优质文章
正在加载中
感谢您的支持,我会继续努力的~
扫码打赏,你说多少就多少
赞赏金额会直接到老师账户
支付方式
打开微信扫一扫,即可进行扫码打赏哦