I am new in python and I want to filter html tags by using regex. I used the function as below:
def filter_tags(htmlstr):
re_cdata=re.compile('//<!\[CDATA\[.*//\]\]>',re.DOTALL)
re_script=re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>',re.DOTALL)#Script
re_style=re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>',re.I)#style
re_br=re.compile('<br\s*?/?>')
re_h=re.compile('</?\w+[^>]*>')
re_function = re.compile('')
re_comment=re.compile('<!--[^>]*-->')
s=re_cdata.sub('',htmlstr)
s=re_script.sub('',s)
s=re_style.sub('',s)
s=re_br.sub('',s)
s=re_h.sub('',s)
s=re_comment.sub('',s)
s = re.sub('\\t','',s)
s = re.sub(' ','',s)
return s
Most tags and codes can be removed except some js functions, and I got some trouble like this:
(function(){
NTES.ajax.importJs('http://news.163.com/special/hot_tags_recommend_data/',function(){
varname1,name2,len1,len2,width1,width2,left2;
varloveData=['拎婚房待嫁北京爷们','请网友鉴定是否美女'];
if(hotTagsData.count&&hotTagsData.count>0){
varcode='#from=article',
html=[],
item={name:'',url:''};
for(vari=0;i<hotTagsData.data.length&&i<4;i++){
item=hotTagsData.data[i];
html.push(''+item.name+'');
if(i==1){name1=item.name;}
if(i==2){name2=item.name;}
}
html.push(loveData[0]);
html.push(loveData[1]);
NTES('#js-extraTagList').innerHTML=html.join('');
len1=name1.replace(/[^\x00-\xff]/gi,"aa").length;
len2=name2.replace(/[^\x00-\xff]/gi,"aa").length;
width1=Math.floor((len1/(len1+len2))*271);
width2=271-width1;
left2=96+width1+19;
NTES('.extra-tag-1').addCss('width:'+width1+'px');
NTES('.extra-tag-2').addCss('width:'+width2+'px;left:'+left2+'px;');
}
},'gbk');
})();
As you can see, there are many founctions like this.So how can I remove these by using regex? thanks a lot.