# Question (translated from the original post):
# Hello, I am trying to do stemming in Python and I want to use the regex
# module to remove or change URLs (https/http). I have written a few lines of
# code, but it only seems to work for punctuation and not for emoticons and
# URLs. Can someone please help me? Here is my code.
#
# NOTE(review): the pasted code had lost its line breaks, so the nesting in
# main() below is a reconstruction -- confirm against the asker's real file.
# `re`, `csv`, `Counter`, `stopword` and `stemmer` are not defined in the
# snippet and are assumed to be imported/created elsewhere in that file.

# Emoticons to strip, taken from the original emoticons() call (duplicates
# dropped).
_EMOTICONS = (
    ':-)', ':)', '(:', '(-:', ':-D', ':D', 'X-D', 'XD', 'xD',
    ';-)', ';)', ';-D', ';D', '(;', '(-;', ':-(', ':(',
    ':,(', ":'(", ':"(', ':((',
)


def _emoticon_regex(emoticon):
    """Return an escaped regex for one emoticon.

    Emoticons that start or end with a letter (e.g. ``XD``) get a lookaround
    so they are not cut out of the middle of ordinary words.
    """
    pattern = re.escape(emoticon)
    if emoticon[0].isalnum():
        pattern = r'(?<!\w)' + pattern
    if emoticon[-1].isalnum():
        pattern = pattern + r'(?!\w)'
    return pattern


# Longest alternatives first so ':((' wins over ':('.
_EMOTICON_RE = re.compile(
    '|'.join(
        _emoticon_regex(e)
        for e in sorted(_EMOTICONS, key=len, reverse=True)
    )
)
_URL_RE = re.compile(r'https?://\S+')
_PUNCTUATION_RE = re.compile(r'[>)}:{",?+ !.(<;1234567890]')


def tokenWordbase(verse):
    """Split *verse* into tokens on single space characters."""
    return verse.split(' ')


def url(link):
    """Return *link* with every http:// or https:// URL removed.

    The original computed the substitution but never returned it, so the
    caller always received None. The pattern is also no longer anchored to
    the start of a line, so URLs in the middle of a sentence are removed too.
    """
    return _URL_RE.sub('', str(link))


def punctuation(tokens):
    """Return *tokens* without punctuation, spaces, digits and newlines."""
    cleaned = _PUNCTUATION_RE.sub('', str(tokens))
    return cleaned.replace('\n', '')


def emoticons(emot):
    """Return *emot* with the known emoticons removed.

    The original passed every emoticon as a separate positional argument to
    re.sub (a TypeError), left regex metacharacters such as ')' unescaped and
    read the undefined name ``tokens`` instead of its own parameter.
    NOTE(review): removal (rather than replacement) is assumed from the
    question text -- confirm with the asker.
    """
    return _EMOTICON_RE.sub('', str(emot))


def main():
    """Tokenise, clean, stop-word filter and stem column 0 of Book2.csv.

    URLs and emoticons are stripped before punctuation(); otherwise the
    punctuation pass deletes ':', '.', '(' and ')' first and neither can be
    recognised any more.
    """
    currentString = []   # every stemmed token, across all rows
    panjangTang = []     # one list of stemmed tokens per processed row
    with open('Book2.csv', newline='') as f:
        for row in csv.reader(f, delimiter=','):
            # Skip blank rows and rows whose first cell is empty or '-'.
            if not row or row[0] in ('', '-'):
                continue
            panjangTangSementara = []
            for token in tokenWordbase(row[0]):
                cleaned = punctuation(emoticons(url(token)))
                if not cleaned:
                    continue
                stopnya = stopword.remove(cleaned)
                if stopnya == '':
                    continue
                stemmnya = stemmer.stem(stopnya)
                currentString.append(stemmnya)
                panjangTangSementara.append(stemmnya)
            panjangTang.append(panjangTangSementara)
    # NOTE(review): the posted snippet ends here; these two names are
    # presumably used by code that was not included in the question.
    daftarDokumen = Counter(currentString)
    daftarString = []
2 回答

慕斯王
TA贡献1864条经验 获得超2个赞
# Substrings that mark a token as a URL when they occur anywhere in it.
# "mobi" was missing its leading dot (it flagged words such as "mobile") and
# ".us " carried a trailing space (it could never match a single token).
domain_registers = [".com", ".net", ".info", ".biz", ".mobi", ".xxx", ".asia", ".eu", ".name", ".us", ".co", ".org", ".me", ".mx", ".in", ".ac", ".asp", ".php", ".html"]


def url(link):
    """Return "" when *link* looks like a URL, otherwise *link* unchanged.

    A token counts as a URL when it contains both '.' and '/', contains
    'www.' or 'http', or contains any entry of ``domain_registers``.
    The check is a plain substring heuristic, so ordinary text that happens
    to contain e.g. '.in' is treated as a URL as well.
    """
    looks_like_url = (
        ('.' in link and '/' in link)
        or 'www.' in link
        or 'http' in link
        or any(word in link for word in domain_registers)
    )
    if looks_like_url:
        return ""
    # The posted snippet fell through to an implicit None here; returning the
    # token lets the function be dropped into a str -> str cleaning pipeline.
    return link
添加回答
举报
0/150
提交
取消