def _get_new_data(self, page_url, soup):
res_data = {}
# Url
res_data['url'] = page_url
print 'now page_url add in res_data !'
'''
<div class="antialiased sans-serif text-lg _2c text-center bold truncate">All Pins</div> (Pinterest)
<img id="i-f88v" src="https://s-media-cache-ak0.pinimg.com/236x/0a/ad/cc/0aadcce6c2daba0e0869e6fc6ee9649d.jpg"
class="pinImg fullBleed loaded" alt="short curly bob hairstyle"> (Pinterest)
#mg_node = soup.find('img', id=re.compile(r"^i-.+")).find(" ", src=re.compile(r"https://s-media-cache-ak0.pinimg.com/\d.+?"))
'''
img_node = soup.find(re.compile(r"https://s-media-cache-ak0.pinimg.com/\d.+?"))
# Add the img in res_data
res_data["img"] = img_node
print 'now img_node add in res_data !'
'''
<p class="pinDescription">great piece for the living room or bedroom. I love the...</p>
<div class="pinMetaWrapper">
'''
summary_node = soup.find('div', class_="pinMetaWrapper")
# Add the summary in res_data
res_data["summary"] = summary_node.get_text()
print 'now summary_node add in res_data !'
return res_data
添加回答
举报
0/150
提交
取消