2 回答
TA贡献1796条经验 获得超4个赞
您可以执行与以前相同的程序。
唯一需要注意的是,必须在移位(shift)之前先使用取反运算符(~)。原因是 shift 默认会在序列的第一个位置产生一个 np.nan,使序列不再是纯布尔类型(NaN 是浮点值),之后再对其取反就会失败。
import pandas as pd
import numpy as np
# Sample column of HTML table cells: plain-text cells and link ('href') cells.
table = pd.Series(
    [
        "<td class='test'>AA</td>",  # 0
        "<td class='test'>A</td>",  # 1
        "<td class='test'><a class='test' href=...",  # 2
        "<td class='test'>B</td>",  # 3
        "<td class='test'><a class='test' href=...",  # 4
        "<td class='test'>BB</td>",  # 5
        "<td class='test'>C</td>",  # 6
        "<td class='test'><a class='test' href=...",  # 7
        "<td class='test'>F</td>",  # 8
        "<td class='test'>G</td>",  # 9
        "<td class='test'><a class='test' href=...",  # 10
        "<td class='test'>X</td>",  # 11
    ]
)

# True where the cell does NOT contain a link.
not_contain = ~table.str.contains('href')

# A position is flagged when both it and the previous cell are link-free.
# fill_value=False keeps the shifted mask boolean: a plain shift(1) would put
# NaN in the first slot and the `&` would then depend on pandas coercing that
# NaN to False.
# NOTE(review): position 0 is therefore never flagged, whereas the loop-based
# version of this snippet does flag a leading link-free cell - confirm which
# behaviour is wanted.
cond = not_contain & not_contain.shift(1, fill_value=False)

# np.insert puts one "None" placeholder in front of every flagged position.
array = np.insert(table.values, cond[cond].index, "None")
pd.Series(array)
TA贡献1873条经验 获得超9个赞
这解决了上述问题,但核心逻辑没有用到 NumPy 和 Pandas。如果你能用它们重新实现,我会把你的回答采纳为正确答案。
import pandas as pd
import numpy as np
# Sample column of HTML table cells: plain-text cells and link ('href') cells.
table = pd.Series(
    [
        "<td class='test'>AA</td>",  # 0
        "<td class='test'>A</td>",  # 1
        "<td class='test'><a class='test' href=...",  # 2
        "<td class='test'>B</td>",  # 3
        "<td class='test'><a class='test' href=...",  # 4
        "<td class='test'>BB</td>",  # 5
        "<td class='test'>C</td>",  # 6
        "<td class='test'><a class='test' href=...",  # 7
        "<td class='test'>F</td>",  # 8
        "<td class='test'>G</td>",  # 9
        "<td class='test'><a class='test' href=...",  # 10
        "<td class='test'>X</td>",  # 11
    ]
)

# True for every cell that does not contain a link. Iterating the Series
# walks the values positionally, so this does not depend on the index labels.
is_text = ['href' not in cell for cell in table]

# insertAt[i] is True when a "None" placeholder must be inserted before cell i:
#   * cell 0 is flagged when it is link-free;
#   * any later cell is flagged when it and the cell before it are link-free.
# Every cell is evaluated, including the last one (the earlier
# range(0, len(table) - 1) loop stopped one short), and a one-element table
# now gets its flag as well.
insertAt = []
for i, text in enumerate(is_text):
    if i == 0:
        insertAt.append(text)
        continue

    duplicated = is_text[i - 1] and text
    insertAt.append(duplicated)

    # Debug output, kept with the original wording and numbering.
    if i == 1:
        print(i, ' starts with tag' if duplicated else ' not start with tag')
        print(i, ' is duplicated' if duplicated else ' is not duplicated')
    else:
        print(i + 1, ' is duplicated' if duplicated else ' is not duplicated')

# dtype=bool keeps the mask usable for boolean indexing even when it is empty.
insertAt = pd.Series(insertAt, dtype=bool)

# np.insert puts one "None" placeholder in front of every flagged position.
array = np.insert(table.values, insertAt[insertAt].index, "None")
pd.Series(array)  # back to series if necessary
添加回答
举报