3 回答
TA贡献1725条经验 获得超7个赞
我发现了问题所在。下面这一行
start = start + 1
应该放在最后一个 else 语句中。
所以我的代码现在如下所示,并且能得到上面指定的期望输出:
def tokenize(lines):
    """Split text lines into a flat list of tokens.

    A token is one of:
      * a maximal run of alphabetic characters, lower-cased;
      * a maximal run of digit characters, kept as-is;
      * any other single non-whitespace character (e.g. punctuation).

    Whitespace separates tokens and is never emitted.

    Args:
        lines: An iterable of strings.

    Returns:
        A list of token strings in the order they appear.
    """
    words = []
    for line in lines:
        length = len(line)
        start = 0
        while start < length:
            # Skip whitespace. The bounds check matters: without it an
            # all-blank line or trailing whitespace indexes past the end.
            while start < length and line[start].isspace():
                start = start + 1
            if start >= length:
                # Only whitespace was left on this line.
                break
            end = start
            if line[start].isalpha():
                # The bounds check lets a word be the last thing on the line.
                while end < length and line[end].isalpha():
                    end = end + 1
                words.append(line[start:end].lower())
                start = end
            elif line[start].isdigit():
                while end < length and line[end].isdigit():
                    end = end + 1
                words.append(line[start:end])
                start = end
            else:
                # Any other character is a one-character token.
                words.append(line[start])
                start = start + 1
    return words
但是,当我使用下面的测试脚本来确保函数“tokenize”没有遗漏任何边界情况时……
import io
import sys
import importlib.util
def test(fun, x, y):
    """Run one check of ``fun`` against the expected value ``y``.

    When ``x`` is exactly a tuple it is unpacked as positional arguments;
    otherwise it is passed as the single argument. A match increments the
    module-level ``pass_tests`` counter. A mismatch prints a report of the
    failed condition and increments ``fail_tests``.
    """
    global pass_tests, fail_tests
    unpack_args = type(x) is tuple
    z = fun(*x) if unpack_args else fun(x)
    if y == z:
        pass_tests += 1
        return
    # Render the call the way it would have been written in source.
    s = repr(x) if unpack_args else "("+repr(x)+")"
    print("Condition failed:")
    print(" "+fun.__name__+s+" == "+repr(y))
    print(fun.__name__+" returned/printed:")
    print(str(z))
    fail_tests += 1
def run(src_path=None):
    """Run the lab test-suite against a ``wordfreq`` module.

    Args:
        src_path: Directory containing ``wordfreq.py``. When ``None`` the
            module is imported through the normal import system instead.

    Returns:
        True when ``tokenize``, ``countWords`` and ``printTopMost`` are all
        present in the module and no test failed; False otherwise.
    """
    global pass_tests, fail_tests
    if src_path is None:
        import wordfreq
    else:
        spec = importlib.util.spec_from_file_location("wordfreq", src_path+"/wordfreq.py")
        wordfreq = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(wordfreq)
    pass_tests = 0
    fail_tests = 0
    fun_count = 0

    def printTopMost(freq,n):
        # Capture what wordfreq.printTopMost prints so it can be compared
        # as a string. The finally clause guarantees stdout is restored
        # even if the function under test raises.
        saved = sys.stdout
        sys.stdout = io.StringIO()
        try:
            wordfreq.printTopMost(freq,n)
            out = sys.stdout.getvalue()
        finally:
            sys.stdout = saved
        return out

    if hasattr(wordfreq, "tokenize"):
        fun_count = fun_count + 1
        test(wordfreq.tokenize, [], [])
        test(wordfreq.tokenize, [""], [])
        test(wordfreq.tokenize, [" "], [])
        test(wordfreq.tokenize, ["This is a simple sentence"], ["this","is","a","simple","sentence"])
        test(wordfreq.tokenize, ["I told you!"], ["i","told","you","!"])
        test(wordfreq.tokenize, ["The 10 little chicks"], ["the","10","little","chicks"])
        test(wordfreq.tokenize, ["15th anniversary"], ["15","th","anniversary"])
        test(wordfreq.tokenize, ["He is in the room, she said."], ["he","is","in","the","room",",","she","said","."])
    else:
        print("tokenize is not implemented yet!")

    if hasattr(wordfreq, "countWords"):
        fun_count = fun_count + 1
        test(wordfreq.countWords, ([],[]), {})
        test(wordfreq.countWords, (["clean","water"],[]), {"clean":1,"water":1})
        test(wordfreq.countWords, (["clean","water","is","drinkable","water"],[]), {"clean":1,"water":2,"is":1,"drinkable":1})
        test(wordfreq.countWords, (["clean","water","is","drinkable","water"],["is"]), {"clean":1,"water":2,"drinkable":1})
    else:
        print("countWords is not implemented yet!")

    if hasattr(wordfreq, "printTopMost"):
        fun_count = fun_count + 1
        # These use the capturing wrapper defined above, not the module's own function.
        test(printTopMost,({},10),"")
        test(printTopMost,({"horror": 5, "happiness": 15},0),"")
        test(printTopMost,({"C": 3, "python": 5, "haskell": 2, "java": 1},3),"python 5\nC 3\nhaskell 2\n")
    else:
        print("printTopMost is not implemented yet!")

    print(str(pass_tests)+" out of "+str(pass_tests+fail_tests)+" passed.")
    return (fun_count == 3 and fail_tests == 0)


if __name__ == "__main__":
    run()
...我得到以下输出:
/usr/local/bin/python3.7 "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py"
Traceback (most recent call last):
File "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py", line 81, in <module>
run()
File "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py", line 50, in run
test(wordfreq.tokenize, [" "], [])
File "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py", line 10, in test
z = fun(x)
File "/Users/ericjohannesson/Documents/Fristående kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/wordfreq.py", line 44, in tokenize
while line[start].isspace():
IndexError: string index out of range
为什么说字符串索引超出范围?我该如何解决这个问题?
TA贡献1847条经验 获得超11个赞
我不确定你为什么要做大小写转换,但你可以这样拆分:
# Sample data: a one-element list holding the line to split.
# NOTE: the name `input` shadows the built-in input() for the rest of this script.
input = ['15, delicious& Tarts.']
line = input[0]
# Splitting on a single space yields '' entries wherever spaces are adjacent.
words = line.split(' ')
# Drop those empty strings so only the non-empty pieces remain.
words = [word for word in words if word]
out:
['15,', 'delicious&', 'Tarts.']
编辑:看到你修改了期望的输出格式。只需去掉下面这一行即可得到该输出:
words = [word for word in words if word]
TA贡献1827条经验 获得超9个赞
itertools.groupby可以大大简化这一点。基本上,您根据字符的类别或类型(字母、数字或标点符号)对字符串中的字符进行分组。在此示例中,我只定义了这三个类别,但您可以根据需要定义任意数量的类别。任何不匹配任何类别的字符(本例中为空格)将被忽略:
def get_tokens(string):
    """Yield runs of consecutive characters that share a category.

    The categories are ASCII letters, digits and punctuation. Characters
    that belong to none of them (such as spaces) are dropped and also
    separate neighbouring runs.
    """
    from itertools import groupby
    from string import ascii_letters, digits, punctuation

    categories = (ascii_letters, digits, punctuation)

    def classify(char):
        # The first category containing the character is the grouping key;
        # the empty string marks "no category" and is falsy.
        for category in categories:
            if char in category:
                return category
        return ""

    for kind, chars in groupby(string, key=classify):
        if kind:
            yield "".join(chars)


print(list(get_tokens("15, delicious& Tarts.")))
输出:
['15', ',', 'delicious', '&', 'Tarts', '.']
>>>
添加回答
举报