这个程序是抽取豆瓣top250页面所有电影相关信息(名称,分数,影评人数,引用语)。问题是在parse_page函数中,top250共十个页面,成功提取前八页的信息,但最后两页的信息提取有问题提示listindexoutofrange,但此数据在for中有显示,for之外调用时就出错。求解。importsocketimportssldeflog(*args,**kwargs):print('log:',*args,**kwargs)defparse_url(url):#提取协议与uriprotocol=url.split('://')[0]ifprotocol=='http':protocol='http'uri=url.split('://')[1]elifprotocol=='https':protocol='https'uri=url.split('://')[1]else:uri=url#提取主机地址index=uri.find('/')ifindex==-1:host=urielse:host=uri.split('/')[0]#提取端口号http_ports={'http':80,'https':443,}ifprotocolinhttp_ports:port=http_ports[protocol]else:port=uri.split(':')[1]#提取路径ifindex==-1:path='/'else:path='/'+uri.split('/')[1]returnprotocol,host,port,pathdefsocket_by_protocol(protocol):ifprotocol=='http':s=socket.socket()elifprotocol=='https':s=ssl.wrap_socket(socket.socket())returnsdefresponse_by_socket(s):buffer_size=1024all_data=b''whileTrue:response=s.recv(buffer_size)iflen(response)==0:breakall_data+=responsereturnall_data.decode()defparse_response(response):errors=''ifresponse:header,body=response.split('\r\n\r\n',1)header_line=header.split('\r\n')status_code=header_line[0].split()[1]headers={}forlineinheader_line[1:]:k,v=line.split(':')headers[k]=velse:errors='responseisnullvalue.'headers={}body=''returnstatus_code,headers,bodydefconstruct_request(host,path):request='GET{}HTTP/1.1\r\nhost:{}\r\nconnection:close\r\n\r\n'.format(path,host)returnrequest.encode()defget(url,query):protocol,host,port,path=parse_url(url)s=socket_by_protocol(protocol)s.connect((host,port))cons_path='{}?{}={}'.format(path,query[1],query[0])request=construct_request(host,cons_path)s.send(request)response=response_by_socket(s)status_code,header,body=parse_response(response)returnstatus_code,header,bodydefparse_page(source=''):mv_name=[]mv_score=[]mv_people=[]mv_quot=[]first_split=str(source.split('').pop(1))second_split=str(first_split.split('').pop(0))third_split=second_split.split('')delthird_split[0]forlineinthird_split:line=line.split('')delline[1]#名称抽取raw_single_mv_name=line[0].split('')[0].split('')[1]single_mv_name=raw_single_mv_name.split('')[0]mv_name.append(single_mv_name)#分数与评价人数抽取raw_single_mv_evaluate=line[0].split('')[1].split('')single_mv_score=raw_single_mv_evaluate[1].split('">')[1]mv_score.append(single_mv_score)single_mv_people=raw_single_mv_evaluate[3].split('')[1]mv_people.append(single_mv_people)#引用语抽取#log(mv_name,mv_score,mv_people,line[0])#log(line[0].split('')[1])raw_singe_mv_quot=line[0].split('')[1]#log(raw_singe_mv_quot)single_mv_quot=raw_singe_mv_quot.split('')[0]#log(single_mv_quot)mv_quot.append(single_mv_quot)#此处mv_quot有值log(mv_quot)#为何这里mv_quot提示listindexoutofrangelog(mv_quot)#log(len(mv_name),len(mv_score),len(mv_people),len(mv_quot))returnmv_name,mv_score,mv_people,mv_quotdefmain():url="https://movie.douban.com/top250"protocol,host,port,path=parse_url(url)log(protocol,host,port,path)queries={}forvin[valueforvalueinrange(250,0,-25)]:queries[v]='start'log(queries)i=0forqinqueries.items():try:status_code,header,body=get(url,q)"""ifi==8:log(status_code,header,body)"""mvo_name,mvo_score,mvo_people,mvo_quot=parse_page(source=body)#log(mvo_name)#log(mvo_score)#log(mvo_people)log(mvo_quot)i+=1exceptExceptionase:log(e)continueif__name__=='__main__':main()
2 回答
四季花海
TA贡献1811条经验 获得超5个赞
这行代码有问题raw_singe_mv_quot=line[0].split('')[1] 拆开解释tmp_list=line[0].split('') raw_singe_mv_quot=tmp_list[1]tmp_list这个列表的长度可能为1,所以tmp_list[1]会报错误。具体逻辑我也没看,你自己排查吧!
添加回答
举报
0/150
提交
取消