首页 > 手记 > 微信公众号批量爬取java版（二）

微信公众号批量爬取java版（二）

标签：

PHP Java

我是用java写的服务端代码。

处理公众号历史页数据方法：

/**
 * Handles the JSON payload captured from an official-account history page.
 *
 * <p>Registers the account (the {@code __biz} query parameter of {@code url}) in the
 * account table when it is not there yet, then walks the {@code list} array of
 * {@code str}. For every message whose {@code comm_msg_info.type} is 49 (article
 * message) the head article and, when {@code is_multi} is 1, every entry of
 * {@code multi_app_msg_item_list} are put on the crawl queue and stored in the post
 * table, unless a post with the same content URL is already stored.
 *
 * @param str JSON text of the history page
 * @param url request URL the JSON was captured from; its query string carries {@code __biz}
 * @throws UnsupportedEncodingException if URL-encoding an article title with "utf-8" fails
 */
public void getMsgJson(String str ,String url) throws UnsupportedEncodingException {
    String biz = "";
    Map<String,String> queryStrs = HttpUrlParser.parseUrl(url);
    if (queryStrs != null) {
        String rawBiz = queryStrs.get("__biz");
        // Only decorate a value that is really present; a missing parameter used to
        // become the bogus account key "null==".
        if (rawBiz != null) {
            // NOTE(review): presumably the parser drops the trailing "==" of the biz
            // value, which is why it is re-appended here -- confirm against HttpUrlParser.
            biz = rawBiz + "==";
        }
    }

    // An unknown biz means a new target account was added: remember it.
    List<WeiXin> knownAccounts = weiXinMapper.selectByBiz(biz);
    if (knownAccounts == null || knownAccounts.isEmpty()) {
        WeiXin weiXin = new WeiXin();
        weiXin.setBiz(biz);
        weiXin.setCollect(System.currentTimeMillis());
        weiXinMapper.insert(weiXin);
    }

    List<Object> messages = JsonPath.read(str, "['list']");
    for (Object message : messages) {
        int type = JsonPath.read(message, "['comm_msg_info']['type']");
        if (type != 49) { // 49 marks an article (image-and-text) message
            continue;
        }

        String rawContentUrl = JsonPath.read(message, "$.app_msg_ext_info.content_url");
        String contentUrl = cleanContentUrl(rawContentUrl);
        int isMulti = JsonPath.read(message, "$.app_msg_ext_info.is_multi");
        Integer datetime = JsonPath.read(message, "$.comm_msg_info.datetime"); // send time

        // The head article is always offered to the queue, even when the post is
        // already stored.
        enqueueContentUrl(contentUrl);
        if (!isPostStored(contentUrl)) {
            postMapper.insert(buildPostFromJson(
                    message, "$.app_msg_ext_info.", biz, contentUrl, 1, isMulti, datetime));
        }

        if (isMulti != 1) {
            continue;
        }

        // Multi-article message: the remaining articles live in a nested list.
        List<Object> subItems =
                JsonPath.read(message, "['app_msg_ext_info']['multi_app_msg_item_list']");
        for (Object subItem : subItems) {
            Object rawSubUrl = JsonPath.read(subItem, "['content_url']");
            String subUrl = cleanContentUrl(rawSubUrl.toString());
            if (isPostStored(subUrl)) {
                continue;
            }
            // Sub-articles are queued only when they are new. The insert is now guarded
            // the same way as for the head article, so one already-queued URL no longer
            // aborts the rest of the batch.
            enqueueContentUrl(subUrl);
            postMapper.insert(buildPostFromJson(
                    subItem, "$.", biz, subUrl, 0, isMulti, datetime));
        }
    }
}

/** Removes JSON escaping backslashes and the "amp;" left over from "&amp;" in a content URL. */
private String cleanContentUrl(String rawUrl) {
    return rawUrl.replace("\\", "").replace("amp;", "");
}

/** Strips escaping backslashes from an optional URL field; a null value stays null. */
private String stripBackslashes(String value) {
    return value == null ? null : value.replace("\\", "");
}

/** Adds a non-empty content URL to the crawl queue; a failing insert is reported and ignored. */
private void enqueueContentUrl(String contentUrl) {
    if (contentUrl == null || "".equals(contentUrl)) {
        return;
    }
    try {
        TmpList tmpList = new TmpList();
        tmpList.setContentUrl(contentUrl);
        tmpListMapper.insertSelective(tmpList);
    } catch (Exception e) {
        // NOTE(review): any failure is treated as "already queued" -- presumably the
        // queue table has a unique constraint on content_url; confirm.
        System.out.println("队列已存在,不插入！");
    }
}

/** Tells whether the post table already holds a row with the given content URL. */
private boolean isPostStored(String contentUrl) {
    List<Post> stored = postMapper.selectByContentUrl(contentUrl);
    return stored != null && !stored.isEmpty();
}

/**
 * Builds a Post from the article fields found below {@code pathPrefix} in {@code node}
 * ("$.app_msg_ext_info." for a head article, "$." for a multi-item entry).
 */
private Post buildPostFromJson(Object node, String pathPrefix, String biz, String contentUrl,
        int isTop, int isMulti, Integer datetime) throws UnsupportedEncodingException {
    Integer fileid = JsonPath.read(node, pathPrefix + "fileid");       // id assigned by WeChat
    String title = JsonPath.read(node, pathPrefix + "title");
    String titleEncode = URLEncoder.encode(title, "utf-8");
    String digest = JsonPath.read(node, pathPrefix + "digest");        // article summary
    String sourceUrl = JsonPath.read(node, pathPrefix + "source_url"); // "read original" link
    String cover = JsonPath.read(node, pathPrefix + "cover");          // cover image

    Post post = new Post();
    post.setBiz(biz);
    post.setTitle(title);
    post.setTitleEncode(titleEncode);
    post.setFieldId(fileid);
    post.setDigest(digest);
    post.setSourceUrl(stripBackslashes(sourceUrl));
    post.setCover(stripBackslashes(cover));
    post.setIsTop(isTop); // 1 = head article, 0 = secondary article
    post.setIsMulti(isMulti);
    post.setDatetime(datetime);
    post.setContentUrl(contentUrl);
    return post;
}

处理公众号文章页的方法：

/**
 * Produces the script injected into an article page to move on to the next target.
 *
 * <p>Rows with load = 1 are deleted from the queue first. When more than one queued row
 * is returned by {@code selectMany(5)}, the first one is marked as loading and its
 * content URL becomes the next target. Otherwise an account is taken from the account
 * table, its history page becomes the next target and its collect time is set to now.
 *
 * @return a {@code <script>} snippet that redirects to the next URL after 3-5 seconds
 */
public String getWxPost() {
    tmpListMapper.deleteByLoad(1);
    List<TmpList> queues = tmpListMapper.selectMany(5);
    String url = "";
    // The last remaining queue row is deliberately not taken here (size must exceed 1).
    if (queues != null && queues.size() > 1) {
        TmpList queue = queues.get(0);
        url = queue.getContentUrl();
        queue.setIsload(1);
        int result = tmpListMapper.updateByPrimaryKey(queue);
        System.out.println("update result:"+result);
    } else {
        // Parenthesised on purpose: without the parentheses the concatenated string was
        // compared to null, which threw a NullPointerException for a null list and
        // dropped the label otherwise.
        System.out.println("getpost queues is null?" + (queues == null ? null : queues.size()));
        WeiXin weiXin = weiXinMapper.selectOne();
        String biz = weiXin.getBiz();
        // Only the second page form is used. The former random choice between the two
        // history-page URL forms was always overwritten by this assignment, so it was
        // removed as dead code.
        url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=" + biz +
                "#wechat_redirect";
        // Record that this account has just been scheduled.
        weiXin.setCollect(System.currentTimeMillis());
        int result = weiXinMapper.updateByPrimaryKey(weiXin);
        System.out.println("getPost weiXin updateResult:"+result);
    }
    int randomTime = new Random().nextInt(3) + 3; // 3, 4 or 5 seconds
    String jsCode = "<script>setTimeout(function(){window.location.href='"+url+"';},"+randomTime*1000+");</script>";
    return jsCode;
}

处理公众号点赞量和阅读量的方法：

/**
 * Stores the read and like counts captured for a single article.
 *
 * <p>The article is looked up by the {@code __biz} and {@code sn} query parameters of
 * {@code url}. When no article matches, rows with load = 1 are removed from the queue
 * and nothing else happens. Otherwise the queue rows matching {@code sn} are deleted
 * and the counts are written to the post.
 *
 * @param str JSON text holding {@code appmsgstat.read_num} and {@code appmsgstat.like_num}
 * @param url request URL the JSON was captured from
 */
public void getMsgExt(String str,String url) {
    Map<String,String> params = HttpUrlParser.parseUrl(url);
    String biz = "";
    String sn = "";
    if (params != null) {
        biz = params.get("__biz") + "==";
        // Wrapped in wildcards because the mapper matches sn with LIKE on content_url.
        sn = "%" + params.get("sn") + "%";
    }

    Post article = postMapper.selectByBizAndSn(biz, sn);
    if (article == null) {
        System.out.println("biz:"+biz);
        System.out.println("sn:"+sn);
        tmpListMapper.deleteByLoad(1);
        return;
    }

    Integer reads;
    Integer likes;
    try {
        reads = JsonPath.read(str, "['appmsgstat']['read_num']");
        likes = JsonPath.read(str, "['appmsgstat']['like_num']");
    } catch (Exception e) {
        // NOTE(review): on any parse failure placeholder numbers are stored as if they
        // were real statistics -- looks like a debugging leftover; confirm before relying
        // on these columns.
        reads = 123;
        likes = 321;
        System.out.println("read_num:"+reads);
        System.out.println("like_num:"+likes);
        System.out.println(e.getMessage());
    }

    // The article has been visited, so it can leave the crawl queue.
    tmpListMapper.deleteBySn(sn);

    article.setReadnum(reads);
    article.setLikenum(likes);
    postMapper.updateByPrimaryKey(article);
}

处理跳转向微信注入js的方法：

/**
 * Produces the script injected into a history page to move on to the next target.
 *
 * <p>Rows with load = 1 are deleted from the queue first. If a queue row can still be
 * selected, it is marked as loading and its content URL is the next target. If the
 * queue is empty, an account is taken from the account table, its history page is the
 * next target and its collect time is set to now.
 *
 * @return a {@code <script>} snippet that redirects to the next URL after 3-5 seconds
 */
public String getWxHis() {
    tmpListMapper.deleteByLoad(1);
    TmpList next = tmpListMapper.selectRandomOne();
    System.out.println("queue is null?"+next);

    String target;
    if (next != null) {
        target = next.getContentUrl();
        // Flag the row as being loaded (load = 1).
        tmpListMapper.updateByContentUrl(target);
    } else {
        // Queue exhausted: fall back to an account's history page.
        // NOTE(review): per the original notes selectOne() returns the account with the
        // oldest collect time -- confirm against the mapper.
        WeiXin account = weiXinMapper.selectOne();
        target = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=" + account.getBiz() +
                "#wechat_redirect";
        account.setCollect(System.currentTimeMillis());
        int updated = weiXinMapper.updateByPrimaryKey(account);
        System.out.println("getHis weiXin updateResult:"+updated);
    }

    // Wrap the next URL in a delayed redirect; the proxy injects it into the WeChat page.
    int delaySeconds = new Random().nextInt(3) + 3;
    StringBuilder script = new StringBuilder();
    script.append("<script>setTimeout(function(){window.location.href='")
          .append(target)
          .append("';},")
          .append(delaySeconds * 1000)
          .append(");</script>");
    return script.toString();
}

以上就是对代理服务器拦截到的数据进行处理的程序。这里有一个需要注意的问题：程序会对数据库中收录的每个公众号进行轮询访问，甚至已经存储的文章也会再次访问，目的是持续更新文章的阅读数和点赞数。如果需要抓取大量的公众号，建议对添加任务队列的代码进行修改，添加条件限制，否则公众号一多，轮询抓取重复数据将十分影响效率。

至此，微信公众号的文章链接就全部爬取到了，而且这些链接永久有效，可以直接在浏览器中打开。接下来就是写爬虫程序，从数据库中取出链接，爬取文章内容等信息了。

我是用webmagic写的爬虫，轻量好用。

public class SpiderModel implements PageProcessor{

    private static PostMapper postMapper;

    private static List<Post> posts;

    // 抓取网站的相关配置，包括编码、抓取间隔、重试次数等
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    public Site getSite() {
        // TODO Auto-generated method stub
        return this.site;
    }

    public void process(Page page) {
        // TODO Auto-generated method stub
        Post post = posts.remove(0);
        String content = page.getHtml().xpath("//div[@id='js_content']").get();
        //存在和谐文章 此处做判定如果有直接删除记录或设置表示位表示文章被和谐
        if(content == null){
            System.out.println("文章已和谐！");
            //postMapper.deleteByPrimaryKey(post.getId());
            return;
        }
        String contentSnap = content.replaceAll("data-src", "src").replaceAll("preview.html", "player.html");//快照
        String contentTxt = HtmlToWord.stripHtml(content);//纯文本内容

        Selectable metaContent = page.getHtml().xpath("//div[@id='meta_content']");
        String pubTime = null;
        String wxname = null;
        String author = null;
        if(metaContent != null){
            pubTime = metaContent.xpath("//em[@id='post-date']").get();
            if(pubTime != null){
                pubTime = HtmlToWord.stripHtml(pubTime);//文章发布时间
            }
            wxname = metaContent.xpath("//a[@id='post-user']").get();
            if(wxname != null){
                wxname = HtmlToWord.stripHtml(wxname);//公众号名称
            }
            author = metaContent.xpath("//em[@class='rich_media_meta rich_media_meta_text' and @id!='post-date']").get();
            if(author != null){
                author = HtmlToWord.stripHtml(author);//文章作者
            }
        }

//        System.out.println("发布时间:"+pubTime);
//        System.out.println("公众号名称:"+wxname);
//        System.out.println("文章作者:"+author);

        String title = post.getTitle().replaceAll("&nbsp;", "");//文章标题
        String digest = post.getDigest();//文章摘要
        int likeNum = post.getLikenum();//文章点赞数
        int readNum = post.getReadnum();//文章阅读数
        String contentUrl = post.getContentUrl();//文章链接

        WechatInfoBean wechatBean = new WechatInfoBean();
        wechatBean.setTitle(title);
        wechatBean.setContent(contentTxt);//纯文本内容
        wechatBean.setSourceCode(contentSnap);//快照
        wechatBean.setLikeCount(likeNum);
        wechatBean.setViewCount(readNum);
        wechatBean.setAbstractText(digest);//摘要
        wechatBean.setUrl(contentUrl);
        wechatBean.setPublishTime(pubTime);
        wechatBean.setSiteName(wxname);//站点名称 公众号名称
        wechatBean.setAuthor(author);
        wechatBean.setMediaType("微信公众号");//来源媒体类型

        WechatStorage.saveWechatInfo(wechatBean);

        //标示文章已经被爬取
        post.setIsSpider(1);
        postMapper.updateByPrimaryKey(post);

    }    

    public static void startSpider(List<Post> inposts,PostMapper myPostMapper,String... urls){

        long startTime, endTime;
        startTime = System.currentTimeMillis();
        postMapper = myPostMapper;
        posts = inposts;

        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();        
        SpiderModel spiderModel = new SpiderModel();
        Spider mySpider = Spider.create(spiderModel).addUrl(urls);
        mySpider.setDownloader(httpClientDownloader);
        try {
            SpiderMonitor.instance().register(mySpider);
            mySpider.thread(1).run();
        } catch (JMException e) {
            e.printStackTrace();
        }

        endTime = System.currentTimeMillis();
        System.out.println("爬取时间" + ((endTime - startTime) / 1000) + "秒--");

    }

}

其它的一些无关逻辑的存储数据代码就不贴了，这里我把代理服务器抓取到的数据存在了mysql，把自己的爬虫程序爬到的数据存储在了mongodb。

下面是自己爬取到的公众号的信息：