第三步:采集分页列表数据并入库
差不多采集了5000多条数据,速度很快
package main import ( "fmt" "database/sql" _ "github.com/go-sql-driver/mysql" "strings" "log" "github.com/PuerkitoBio/goquery" "io/ioutil" "os" "strconv" ) var ( //获取数据 topicinfo = []string{"", "", "", "", ""} ) type DbWorker struct { //mysql data source name Dsn string } func main () { //连接MySQL dbw := DbWorker{ Dsn: "root:xiaohai123@tcp(localhost:3306)/test?charset=utf8", } db, err := sql.Open("mysql", dbw.Dsn) defer db.Close() if err != nil { panic(err) } else { fmt.Println("数据库链接成功!") } dataPath := "data/city_site.txt" content, err := ioutil.ReadFile(dataPath) if err != nil { panic(err) } //读取所有的内容 str := string(content) citys := strings.Split(str, "\n") //保存所有city映射地址的连接 cityUrls := map[string]string{} for _, city := range citys { cityUrl := strings.Split(city, " ") cityUrls[cityUrl[1]] = cityUrl[2]+"/company/" } var index int = 0 var pagelasturl string = "" fd, _ := os.OpenFile("data/city_company_with_page.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644) for _, cityUrl := range cityUrls { pageurl := cityUrl+"?p=" for { index++ temp := strconv.Itoa(index) pagelasturl = pageurl + temp doc, _ := goquery.NewDocument(pagelasturl) h := doc.Find(".page a").Last().Text() if h != "下一页" { index = 0 fmt.Println("采集最后一页"+pagelasturl) getpageinfo(db, pagelasturl) fd.WriteString(pagelasturl + "\n") //其他分页先入库 break } if index == 1{ fmt.Println("采集到本地"+cityUrl) getpageinfo(db, cityUrl) fd.WriteString(cityUrl + "\n") //其他分页先入库 }else{ fmt.Println("采集到本地"+pagelasturl) getpageinfo(db, pagelasturl) fd.WriteString(pagelasturl + "\n") //其他分页先入库 } temp = "" pagelasturl = "" } } } //采集入库 func getpageinfo(db *sql.DB, pagefullurl string) { doc, err := goquery.NewDocument(pagefullurl) if err != nil { log.Fatal(err) } doc.Find(".gongslist li").Each(func(i int, s *goquery.Selection) { //帖子标题 source_url,_:= s.Find(".company-link").Eq(0).Attr("href") topicinfo[0] = source_url topicinfo[1] = pagefullurl //消除各种不兼容字符 topicinfo = splitstring(topicinfo) fmt.Println(topicinfo) //数据库操作 dbmanager(db, 
topicinfo) }) } func splitstring(pageinfo []string) []string { spilitinfo := pageinfo for i := 0; i < 2; i++ { spilitinfo[i] = strings.Replace(pageinfo[i], "'''", " ", -1) spilitinfo[i] = strings.Replace(pageinfo[i], "'", " ", -1) spilitinfo[i] = strings.Replace(pageinfo[i], "''", " ", -1) spilitinfo[i] = strings.Replace(pageinfo[i], "’", " ", -1) spilitinfo[i] = strings.Replace(pageinfo[i], "‘", " ", -1) spilitinfo[i] = strings.Replace(pageinfo[i], "“", " ", -1) spilitinfo[i] = strings.Replace(pageinfo[i], "”", " ", -1) spilitinfo[i] = strings.Replace(pageinfo[i], ",", " ", -1) spilitinfo[i] = strings.Replace(pageinfo[i], "?", " ", -1) } return spilitinfo } func dbmanager(db *sql.DB, info []string) { // fmt.Print(info[0]) var sqlinfo string = "INSERT INTO qizhuang_company (source_url,at_list_url) VALUES('" + info[0] + "','" + info[1] + "')" stmt, err := db.Prepare(sqlinfo) if err != nil { fmt.Println("insert data error: %v\n", err) return } stmt.Exec() }
点击查看更多内容
为 TA 点赞
评论
共同学习,写下你的评论
评论加载中...
作者其他优质文章
正在加载中
感谢您的支持,我会继续努力的~
扫码打赏,你说多少就多少
赞赏金额会直接到老师账户
支付方式
打开微信扫一扫,即可进行扫码打赏哦