I have to convert some beautifulsoup code. Basically what I want is just get all children of the body node and select which has text and store them. Here is the code with bs4 :
def get_children(self, tag, dorecursive=False):
children = []
if not tag :
return children
for t in tag.findChildren(recursive=dorecursive):
if t.name in self.text_containers \
and len(t.text) > self.min_text_length \
and self.is_valid_tag(t):
children.append(t)
return children
this works fine when I try this with lxml lib instead, children is empty :
def get_children(self, tag, dorecursive=False):
children = []
if not tag :
return children
tags = tag.getchildren()
for t in tags:
#print(t.tag)
if t.tag in self.text_containers \
and len(t.tail) > self.min_text_length \
and self.is_valid_tag(t):
children.append(t)
return children
any idea ?