1 回答
TA贡献2021条经验 获得超8个赞
我是题主(OP),解决方案似乎如下:
import re
from nltk.util import ngrams
# Working lists filled in by the rest of the script.
OriginalBooksList = []               # lines as read from the input file
booksAfterRemovingStopWords = []     # normalised lines with stop words dropped
booksWithNGrams = []                 # one list per title: [original line, n-gram, n-gram, ...]

# Words excluded from titles before n-grams are built. The list holds a
# lower-case group followed by the same words capitalised.
stopWords = [
    'I', 'a', 'about', 'an', 'are', 'as', 'at', 'be', 'by', 'com',
    'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that',
    'the', 'this', 'to', 'was', 'the', 'and',
    'A', 'About', 'An', 'Are', 'As', 'At', 'Be', 'By', 'Com',
    'For', 'From', 'How', 'In', 'Is', 'It', 'Of', 'On', 'Or', 'That',
    'The', 'This', 'To', 'Was', 'The', 'And',
]
# Read the raw titles and store, for every title that is kept, both the
# untouched line and a normalised, stop-word-free version of it.
#
# The two lists are appended to together, so position i in
# booksAfterRemovingStopWords always describes the same title as position i
# in OriginalBooksList. (Appending every raw line but only some cleaned lines
# would shift the indices as soon as one line is skipped.)
punctuation_pattern = re.compile(r'[^\w\s]')
space_run_pattern = re.compile(' +')

# Titles are lower-cased before filtering, so the comparison must use
# lower-cased stop words as well; otherwise entries such as 'I' never match.
lowercase_stop_words = {word.lower() for word in stopWords}

with open('UnifiedBookList.txt') as fin:
    for raw_line in fin:
        cleaned = punctuation_pattern.sub(' ', raw_line)  # replace punctuation with space
        cleaned = space_run_pattern.sub(' ', cleaned)     # replace multiple spaces with one
        words = cleaned.lower().split()
        if len(words) <= 2:
            # Blank lines and titles of one or two words are skipped.
            continue
        OriginalBooksList.append(raw_line)
        booksAfterRemovingStopWords.append(
            ' '.join(word for word in words if word not in lowercase_stop_words)
        )
# For each cleaned title build a record whose first item is the entry of
# OriginalBooksList at the same position, followed by every 3-word n-gram of
# the cleaned title joined into a single space-separated string.
for position, cleaned_title in enumerate(booksAfterRemovingStopWords):
    words = cleaned_title.split(" ")
    record = [OriginalBooksList[position]]
    record.extend(' '.join(gram) for gram in ngrams(words, 3))
    booksWithNGrams.append(record)
# Group titles that share at least one record item (the original line or any
# n-gram string) with the title at the front of the list, and print each group:
# the front title first, then every title that matched it. Titles with no
# match are consumed silently. booksWithNGrams is emptied in the process.
while booksWithNGrams:
    first_element = booksWithNGrams.pop(0)
    first_items = set(first_element)

    # Split the remaining records into matches and non-matches in one pass.
    # Removing items from booksWithNGrams while iterating over it would make
    # the loop skip the record following each removed one.
    matched = []
    unmatched = []
    for candidate in booksWithNGrams:
        if first_items & set(candidate):
            matched.append(candidate)
        else:
            unmatched.append(candidate)

    if matched:
        print(first_element[0])
        for candidate in matched:
            print(candidate[0])

    # Slice-assign so the same list object is updated in place.
    booksWithNGrams[:] = unmatched
添加回答
举报