-
#!/usr/bin/env python
# encoding: utf-8
"""Fetch Wikipedia's robots.txt and print it to stdout."""
from urllib.request import urlopen

# Use a context manager so the HTTP response is always closed,
# even if read()/decode() raises (the original leaked the connection).
with urlopen("https://en.wikipedia.org/robots.txt") as req:
    print(req.read().decode('utf-8'))
查看全部 -
#!/usr/bin/env python
# encoding: utf-8
"""Query the `urls` table of the local `wiki` MySQL database.

Prints the total number of stored URL rows, then fetches and prints a
sample of the first five (urlname, urlhref) pairs.
"""
import pymysql

# NOTE(review): MySQL's 'utf8' is a 3-byte subset; 'utf8mb4' is usually
# preferred — kept as-is to match the existing schema/behavior.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='',
                             db='wiki',
                             charset='utf8')
try:
    # Cursor is closed automatically by the `with` block.
    with connection.cursor() as cursor:
        sql = "select `urlname`, `urlhref` from `urls` where `id` is not null"
        # execute() returns the number of rows matched by the query.
        count = cursor.execute(sql)
        print(count)
        # Peek at the first five rows only, instead of fetching everything.
        result = cursor.fetchmany(size=5)
        print(result)
finally:
    # Always release the connection, even if the query fails.
    connection.close()
查看全部 -
#!/usr/bin/env python
# encoding: utf-8
"""Scrape internal article links from the Wikipedia main page and store
them in the local `wiki` MySQL database (`urls` table)."""
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql

resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")
soup = BeautifulSoup(resp, "html.parser")
# All anchors pointing at other wiki articles (relative /wiki/... hrefs).
# Raw string fixes the invalid escape sequences the original relied on.
listUrls = soup.find_all("a", href=re.compile(r"^/wiki/"))

connection = pymysql.connect(host='localhost',
                             user='root',
                             password='',
                             db='wiki',
                             charset='utf8')
print(connection)
try:
    with connection.cursor() as cursor:
        # Parameterized INSERT — values are bound by the driver, never
        # interpolated into the SQL string. Hoisted out of the loop since
        # it is loop-invariant.
        sql = "insert into `urls`(`urlname`,`urlhref`)values(%s, %s)"
        for url in listUrls:
            # Skip links that point at image files.
            if not re.search(r"\.(jpg|jpeg)$", url['href']):
                cursor.execute(sql, (url.get_text(),
                                     "https://en.wikipedia.org" + url["href"]))
        # Commit once after all rows are queued instead of once per row —
        # one transaction, far fewer round trips.
        connection.commit()
finally:
    connection.close()
查看全部 -
urllib
查看全部 -
python3 乱码解决
查看全部 -
mark
查看全部 -
导入模块
1.读取网页信息
2.对读取到的信息进行排版
3.对排版过的数据进行二次获取操作。
4.打印结果
查看全部 -
读取在线PDF查看全部
-
获取维基百科词条查看全部
-
值得一看,爬取数据查看全部
-
用urllib发送post请求；访问有的网站需要添加 Origin、User-Agent 请求头来表明自己不是爬虫，否则会报错。查看全部
-
urllib模拟真实浏览器操作查看全部
-
检测python是否安装成功命令查看全部
-
使用decode("utf-8")可以防止乱码查看全部
-
https://en.wikipedia.org/robots.txt查看全部
举报
0/150
提交
取消