1 回答
TA贡献1863条经验 获得超2个赞
看看这是否满足您的需求。
from simplified_scrapy import SimplifiedDoc, utils
# Sample input document: a <breakfast_menu> root holding five <food> entries.
# Every entry has name/price/description/calories children; the first entry
# also carries an attribute, and the last two each add one extra element, so
# the second and third entries are the only ones with identical structure.
xml = '''
<?xml version="1.0" encoding="UTF-8"?>
<breakfast_menu>
<food some_attribute="1.0">
<name>Belgian Waffles</name>
<price>$5.95</price>
<description>
Two of our famous Belgian Waffles with plenty of real maple syrup
</description>
<calories>650</calories>
</food>
<food>
<name>Strawberry Belgian Waffles</name>
<price>$7.95</price>
<description>
Light Belgian waffles covered with strawberries and whipped cream
</description>
<calories>900</calories>
</food>
<food>
<name>Berry-Berry Belgian Waffles</name>
<price>$8.95</price>
<description>
Belgian waffles covered with assorted fresh berries and whipped cream
</description>
<calories>900</calories>
</food>
<food>
<name>French Toast</name>
<price>$4.50</price>
<description>
Thick slices made from our homemade sourdough bread
</description>
<calories>600</calories>
<some_complex_type_element_1>
<some_simple_type_element_1>Text.</some_simple_type_element_1>
</some_complex_type_element_1>
</food>
<food>
<name>Homestyle Breakfast</name>
<price>$6.95</price>
<description>
Two eggs, bacon or sausage, toast, and our ever-popular hash browns
</description>
<calories>950</calories>
<some_simple_type_element_2>Text.</some_simple_type_element_2>
</food>
</breakfast_menu>
'''
def loop(node):
    """Recursively blank out attribute values and leaf text below ``node``.

    After the call, only the element structure of the subtree is left:
    every attribute is kept but set to an empty string, and every element
    without children has its text content cleared.

    Args:
        node: An element node. It must be iterable over its keys and
            provide ``setAttrs``, ``setContent``, ``children`` and ``text``
            (presumably a ``simplified_scrapy`` node -- confirm against the
            library version in use).

    Returns:
        None. The node is modified in place.
    """
    # Iterating a node yields its keys. 'tag' and 'html' are skipped
    # (presumably bookkeeping entries rather than real attributes -- TODO
    # confirm); every remaining key is treated as an attribute to blank.
    blanked = {key: '' for key in node if key not in ('tag', 'html')}
    if blanked:
        node.setAttrs(blanked)  # Remove attribute values

    children = node.children
    if children:
        for child in children:
            loop(child)
    elif node.text:
        # Only leaf elements are cleared, so nested elements are preserved.
        node.setContent('')  # Remove value
# Parse the sample XML held in the ``xml`` string.
doc = SimplifiedDoc(xml)

# Remove values and attributes so only the element structure remains.
loop(doc.breakfast_menu)

# Keep the first child of each distinct structure and drop later copies.
# The stripped markup of a child is used as its structural fingerprint.
dicNode = {}
for node in doc.breakfast_menu.children:
    key = node.outerHtml
    if key in dicNode:
        node.remove()  # Delete duplicate
    else:
        dicNode[key] = True

print(doc.html)
结果:
<?xml version="1.0" encoding="UTF-8"?>
<breakfast_menu>
<food some_attribute="">
<name></name>
<price></price>
<description></description>
<calories></calories>
</food>
<food>
<name></name>
<price></price>
<description></description>
<calories></calories>
</food>
<food>
<name></name>
<price></price>
<description></description>
<calories></calories>
<some_complex_type_element_1>
<some_simple_type_element_1></some_simple_type_element_1>
</some_complex_type_element_1>
</food>
<food>
<name></name>
<price></price>
<description></description>
<calories></calories>
<some_simple_type_element_2></some_simple_type_element_2>
</food>
</breakfast_menu>
对于大文件，请尝试以下方法。
from simplified_scrapy import SimplifiedDoc, utils
from simplified_scrapy.core.regex_helper import replaceReg
filePath = 'test.xml'
destPath = 'dest.xml'

# Load the source file with lineByline=True and iterate over <food> elements
# one at a time instead of working on the whole document at once.
doc = SimplifiedDoc()
doc.loadFile(filePath, lineByline=True)

# NOTE(review): appendFile presumably appends to an existing file, so a
# leftover dest.xml from an earlier run would accumulate output -- confirm
# and remove the file beforehand if needed.
utils.appendFile(destPath, '<?xml version="1.0" encoding="UTF-8"?><breakfast_menu>')

dicNode = {}
for node in doc.getIterable('food'):
    key = node.outerHtml
    # Drop everything between a '>' and the next '<' (text and whitespace).
    key = replaceReg(key, '>[^>]*?<', '><')
    # Empty every double-quoted attribute value.
    key = replaceReg(key, '"[^"]*?"', '""')
    # Write each distinct structure once; the stripped markup is the output.
    if key not in dicNode:
        dicNode[key] = True
        utils.appendFile(destPath, key)

utils.appendFile(destPath, '</breakfast_menu>')
添加回答
举报