除了学习人数获取不到以外,其他功能都没有问题。
var http = require('http')
var Promise = require('bluebird')
var cheerio = require('cheerio')
// Base URL of the course pages; an ID is appended to it to form each page URL.
var baseUrl = 'http://www.imooc.com/learn/'
// IDs of the pages to crawl; each page is fetched and then parsed as one course.
var videoIds = [935, 796, 694, 327]
/**
 * Extracts the course title, learner count and chapter/video outline from the
 * HTML of one course page.
 *
 * @param {string} html - Raw HTML of a course page.
 * @returns {{title: string, number: number, videos: Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}}
 *   Parsed course data. `number` is 0 when the learner-count element is
 *   missing or does not contain a number. A video `id` is undefined when the
 *   link's href has no `video/` segment.
 */
function filterChapters(html){
  var $ = cheerio.load(html);

  // Returns the text of the selection's direct text nodes only, so text that
  // lives inside child elements is left out.
  function ownText(selection){
    var text = selection.contents().filter(function(){
      return this.nodeType === 3;
    }).text().trim();
    text = text.replace(/<\/?[^>]*>/g,''); // strip anything that looks like an HTML tag
    text = text.replace(/[ | ]*\n/g,'\n'); // drop spaces and '|' characters that precede a newline
    return text;
  }

  // Parse unconditionally and fall back to 0: whitespace-only or non-numeric
  // text would otherwise yield NaN.
  var learnerCount = parseInt($('.js-learn-num').text().trim(), 10);
  if (Number.isNaN(learnerCount)) {
    learnerCount = 0;
  }

  var courseData = {
    title: $('.course-infos .path span').text().trim(),
    number: learnerCount,
    videos: []
  };

  $('.chapter').each(function(){
    var chapter = $(this);
    var chapterData = {
      chapterTitle: ownText(chapter.find('strong')),
      videos: []
    };

    chapter.find('.video').children('li').each(function(){
      var link = $(this).find('.J-media-item');
      var href = link.attr('href');
      // A list item without a media link has no href; skip it instead of
      // throwing on `undefined.split`.
      if (typeof href !== 'string') {
        return;
      }
      chapterData.videos.push({
        title: ownText(link),
        id: href.split('video/')[1]
      });
    });

    courseData.videos.push(chapterData);
  });

  return courseData;
}
/**
 * Prints one popularity line per course, then the chapter and video outline of
 * every course, to the console.
 *
 * @param {Array<Object>} courseData - Course records exposing `title`,
 *   `number` and `videos` (chapters, each with `chapterTitle` and `videos`).
 */
function printCourseInfo(courseData){
  // "<learner count> 人学过 <course title>"
  function logSummary(course){
    console.log(course.number + ' 人学过 ' + course.title + '\n');
  }

  // One line per video: bracketed id followed by the title.
  function logVideo(video){
    console.log(' [' + video.id + '] ' + video.title + '\n');
  }

  // Chapter heading followed by its videos.
  function logChapter(chapter){
    console.log(chapter.chapterTitle + '\n');
    chapter.videos.forEach(logVideo);
  }

  // Course heading followed by its chapters.
  function logOutline(course){
    console.log('### ' + course.title + '\n');
    course.videos.forEach(logChapter);
  }

  // All summaries are printed before any outline.
  courseData.forEach(logSummary);
  courseData.forEach(logOutline);
}
/**
 * Downloads a page over HTTP and resolves with its body as a string.
 *
 * The HTTP status code is not inspected; whatever body the server sends is
 * returned.
 *
 * @param {string} url - URL to request with `http.get`.
 * @returns {Promise<string>} Resolves with the response body decoded as UTF-8;
 *   rejects with the underlying error if the request or the response stream
 *   fails.
 */
function getPageAsync(url){
  return new Promise(function(resolve, reject){
    console.log('正在爬取 ' + url);

    // Shared failure path for request-level and response-level errors.
    function fail(err){
      console.log('获取课程数据出错!');
      reject(err);
    }

    http.get(url, function(res){
      var html = '';
      // Decode at the stream level so a multi-byte character split across two
      // chunks is reassembled rather than corrupted by per-chunk
      // Buffer-to-string conversion.
      res.setEncoding('utf8');
      res.on('data', function(chunk){
        html += chunk;
      });
      res.on('end', function(){
        resolve(html);
      });
      res.on('error', fail);
    }).on('error', fail);
  });
}
// Start every page download up front; Promise.all below waits for all of them.
var fetchCourseArray = videoIds.map(function(id){
  return getPageAsync(baseUrl + id);
});

Promise
  .all(fetchCourseArray)
  .then(function(pages){
    var courseData = pages.map(function(html){
      return filterChapters(html);
    });

    // Most-learned course first. A sort comparator must return a negative,
    // zero or positive number; a boolean never yields a negative value, so the
    // resulting order is not reliable.
    courseData.sort(function(a, b){
      return b.number - a.number;
    });

    printCourseInfo(courseData);
  })
  .catch(function(err){
    // Report download/parse failures explicitly and keep a failing exit status
    // instead of leaving an unhandled rejection.
    console.error('爬取失败:', err);
    process.exitCode = 1;
  });