var http = require('http');
var Promise = require('bluebird');
var cheerio = require('cheerio');
var colors = require('colors');
// Root address of a course page; the driver code appends a course id to it.
var baseUrl = 'http://www.imooc.com/learn/';
// Ids of the courses to crawl.
var videoIds = [348, 259, 197, 134, 75];
// Sample course address. Nothing visible in this file reads it
// (getPageAsync's own `url` parameter shadows it).
var url = 'http://www.imooc.com/learn/348';

// Startup banner: one console line per entry, printed in order.
['正在启动程序', '.', '..', '.........'].forEach(function(line) {
  console.log(line);
});
/**
 * Collapses every run of whitespace (spaces, tabs, line breaks) into one space.
 * Leading and trailing runs are collapsed too, not removed.
 *
 * @param {string} words - Text to normalise.
 * @returns {string} The text with each whitespace run replaced by a single space.
 */
function removeSpecialCharscter(words) {
  return words.replace(/\s+/g, ' ');
}
/**
 * Extracts a course outline (chapters and their sessions) from a course page.
 *
 * @param {{html: string, id: *}} obj - Page markup plus the course id it was
 *   fetched for; the id is copied to the result unchanged.
 * @returns {{id: *, title: string, chapterData: Array<{chapterTitle: string,
 *   sessions: Array<{id: (string|undefined), title: string}>}>}}
 *   The course id, the course title and one entry per `.chapter` element.
 *   A session `id` is undefined when its link is missing or its href does not
 *   contain '/video/'.
 */
function filterHtml(obj) {
  var $ = cheerio.load(obj.html);
  var courseTitle = $('#main .course-infos h2').text().trim();
  var chapterList = [];

  // Collect chapters.
  $('.chapter').each(function() {
    var chapter = $(this);

    // The '.chapter-info' node is removed from a clone, so the parsed
    // document itself is left untouched.
    var $chapterTitle = chapter.find('strong').clone();
    $chapterTitle.find('.chapter-info').remove();

    var chapterData = {
      // Chapter title.
      chapterTitle: removeSpecialCharscter($chapterTitle.text().trim()),
      sessions: []
    };

    // Collect the sessions of this chapter.
    chapter.find('.video li').each(function() {
      var $a = $(this).find('a');

      // Keep only the text before the first '(' (presumably a trailing
      // duration annotation; verify against the page markup).
      var sessionTitle = removeSpecialCharscter($a.text().trim()).split('(')[0];

      // attr() yields undefined when there is no anchor or no href; calling
      // split() on that would throw and abort the whole parse.
      var href = $a.attr('href');
      var id = typeof href === 'string' ? href.split('/video/')[1] : undefined;

      chapterData.sessions.push({
        id: id,
        title: sessionTitle
      });
    });

    chapterList.push(chapterData);
  });

  return {
    id: obj.id,
    title: courseTitle,
    chapterData: chapterList
  };
}
/**
 * Prints one course outline to the console: a header line with the course
 * title and id, then each chapter title followed by its session titles.
 *
 * Uses the `.green` / `.red` / `.yellow` string properties, which are expected
 * to come from the `colors` package required at the top of the file.
 *
 * @param {{id: *, title: string, chapterData: Array<{chapterTitle: string,
 *   sessions: Array<{title: string}>}>}} courseData - Outline to print.
 * @returns {undefined}
 */
function printCourseInfo(courseData) {
  var id = courseData.id;
  var title = courseData.title;

  // Print a single session line.
  function printSession(session) {
    console.log(' ' + session.title.yellow);
  }

  // Print a chapter heading, then all of its sessions.
  function printChapter(chapter) {
    console.log(chapter.chapterTitle.red);
    chapter.sessions.forEach(printSession);
  }

  console.log(('\n《' + title + '》 编号:' + id).green);
  courseData.chapterData.forEach(printChapter);
}
/**
 * Downloads a page over HTTP and resolves with its body and the given id.
 *
 * @param {string} url - Address to fetch.
 * @param {*} id - Passed through unchanged on the resolved value.
 * @returns {Promise<{html: string, id: *}>} Resolves once the response ends;
 *   rejects with the error emitted by the request or the response stream.
 */
function getPageAsync(url, id) {
  return new Promise(function(resolve, reject) {
    http.get(url, function(res) {
      console.log('正在爬取:' + url);

      // Chunks are kept as raw bytes and decoded once at the end. Decoding
      // each chunk on its own corrupts any multi-byte UTF-8 character that
      // happens to straddle a chunk boundary.
      var chunks = [];
      res.on('data', function(data) {
        chunks.push(typeof data === 'string' ? Buffer.from(data) : data);
      });
      res.on('end', function() {
        resolve({
          html: Buffer.concat(chunks).toString(),
          id: id
        });
      });
      // A failure while the body is streaming must settle the promise too.
      res.on('error', reject);
    }).on('error', function(e) {
      console.log('获取网页代码出错!');
      reject(e);
    });
  });
}
// Start every download up front; each entry is the pending promise for one course page.
var fetchCourseArray = videoIds.map(function(id) {
  return getPageAsync(baseUrl + id, id);
});

Promise
  .all(fetchCourseArray)
  .then(function(pages) {
    // Parsing and printing are deferred by one second after the last page arrives.
    setTimeout(function() {
      pages
        .map(function(page) {
          return filterHtml(page);
        })
        // A comparator must return a negative/zero/positive number; returning
        // a boolean (a.id > b.id) never signals "a before b", so the resulting
        // order is not guaranteed.
        .sort(function(a, b) {
          return a.id - b.id;
        })
        .forEach(function(course) {
          printCourseInfo(course);
        });
    }, 1000);
  })
  // Without a handler, one failed download leaves an unhandled rejection.
  .catch(function(err) {
    console.error('爬取失败:', err && err.message ? err.message : err);
  });