课程
                    
                        /后端开发
                        
                            /Python
                        
                        /Python数据预处理（二）- 清洗文本数据

源码能分享下吗

冰忆1996

2019-11-13

源自：Python数据预处理（二）- 清洗文本数据 1-7

关注问题我要回答

1035

操作

收起

2 回答

higandawn
2020-02-09

"""
Description:正则清洗HTML数据
Author:
Prompt: code in python3 env
"""
"""
   re.I   使匹配对大小写不敏感
   re.L   做本地化识别（locale-aware）匹配
   re.M   多行匹配，影响^(开头)和$(结尾)
   re.S   匹配包含换行在内的所有字符
   re.U   根据Unicode字符集解析字符，这个标志影响 \w, \W, \b, \B
   re.X   该标志通过给予你更灵活的格式以便你将正则表达式写得更加易于理解
"""
import re

# 处理HTML标签文本
# @param htmlstr html字符串


def filter_tags(htmlstr, extra_patterns=()):
    """Strip HTML markup from ``htmlstr`` and return the remaining text.

    The cleanup runs in a fixed order: whitespace runs are collapsed, then
    the DOCTYPE, CDATA sections, ``<script>`` elements and comments are
    removed, ``<br>`` is turned into a line break, the remaining tags and
    character entities are dropped, whitespace is collapsed again, and
    finally ``.html`` links and any caller-supplied patterns are replaced
    by a single space.

    Args:
        htmlstr: HTML document or fragment as a string.
        extra_patterns: Optional iterable of additional regular expressions
            whose matches are replaced by a single space. Each entry is a
            pattern string, or a ``(pattern,)`` / ``(pattern, flags)`` tuple.

    Returns:
        The cleaned text. Because link and extra-pattern removal happen
        after whitespace normalisation, the result may still contain runs
        of spaces where such matches were blanked out.
    """
    # (pattern, flags, replacement), applied strictly in this order.
    steps = (
        # DOCTYPE declaration; HTML keywords are case-insensitive.
        (r'<!DOCTYPE .*?>', re.I | re.S, ''),
        # CDATA sections, optionally prefixed by the "//" used inside scripts.
        (r'(?://\s*)?<!\[CDATA\[.*?\]\]>', re.I | re.S, ''),
        # Whole <script> elements; the body may itself contain "<".
        (r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S, ''),
        # HTML comments.
        (r'<!--.*?-->', re.S, ''),
        # Line breaks (<br>, <br/>, <br />) become whitespace, not nothing.
        (r'<br\s*/?>', re.I, '\n'),
        # Any other opening or closing tag.
        (r'</?\w[^>]*>', 0, ''),
        # Character references only (&name; &#123; &#x1F;), so a bare "&"
        # in running text does not swallow everything up to the next ";".
        (r'&(?:#\d+|#[xX][0-9a-fA-F]+|[a-zA-Z]\w*);', 0, ''),
        # Collapse whitespace (\s covers \t \n \r \f \v) into single spaces.
        (r'\s+', 0, ' '),
        # Links to .html pages; non-greedy and whitespace-bounded so the
        # text between two links is kept.
        (r'https?://\S+?\.html', 0, ' '),
    )

    # Normalise whitespace first so the patterns never have to span lines.
    res = ' '.join(htmlstr.split())
    for pattern, flags, replacement in steps:
        res = re.sub(pattern, replacement, res, flags=flags)

    # Caller-supplied patterns replace the former reference to an undefined
    # module-level list, which made every call fail with NameError.
    for entry in extra_patterns:
        if isinstance(entry, str):
            entry = (entry,)
        res = re.compile(*entry).sub(' ', res)
    return res

def read_file(read_path):
    """Return the entire contents of the UTF-8 text file at ``read_path``."""
    with open(read_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


if __name__ == '__main__':
    # Clean the sample page and store the plain-text result beside it.
    cleaned = filter_tags(read_file(r'../data/html/re.html'))

    with open(r'../data/html/test.html', 'w', encoding='utf-8') as out_file:
        out_file.write(cleaned)

    # The written file is meant to be inspected in a separate editor.
    print('No Exception')

这是我的笔记