-2
>>> data
'<a href="/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%B8%E0%A6%82%E0%A6%AF%E0%A7%8B%E0%A6%97%E0%A6%95%E0%A6%BE%E0%A6%B0%E0%A7%80_%E0%A6%AA%E0%A7%83%E0%A6%B7%E0%A7%8D%E0%A6%A0%E0%A6%BE%E0%A6%B8%E0%A6%AE%E0%A7%82%E0%A6%B9/%E0%A6%9F%E0%A7%87%E0%A6%AE%E0%A6%AA%E0%A7%8D%E0%A6%B2%E0%A7%87%E0%A6%9F:%E0%A6%AC%E0%A6%BF%E0%A6%B7%E0%A6%AF%E0%A6%BC%E0%A6%B6%E0%A7%8D%E0%A6%B0%E0%A7%87%E0%A6%A3%E0%A7%80%E0%A6%B9%E0%A7%80%E0%A6%A8&amp;from=0&amp;hidelinks=1" title="বিশেষ:সংযোগকারী পৃষ্ঠাসমূহ/টেমপ্লেট:বিষয়শ্রেণীহীন">পূর্ববর্তী ৫০টি</a>) (<a href="/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%B8%E0%A6%82%E0%A6%AF%E0%A7%8B%E0%A6%97%E0%A6%95%E0%A6%BE%E0%A6%B0%E0%A7%80_%E0%A6%AA%E0%A7%83%E0%A6%B7%E0%A7%8D%E0%A6%A0%E0%A6%BE%E0%A6%B8%E0%A6%AE%E0%A7%82%E0%A6%B9/%E0%A6%9F%E0%A7%87%E0%A6%AE%E0%A6%AA%E0%A7%8D%E0%A6%B2%E0%A7%87%E0%A6%9F:%E0%A6%AC%E0%A6%BF%E0%A6%B7%E0%A6%AF%E0%A6%BC%E0%A6%B6%E0%A7%8D%E0%A6%B0%E0%A7%87%E0%A6%A3%E0%A7%80%E0%A6%B9%E0%A7%80%E0%A6%A8&amp;from=950505&amp;hidelinks=1&amp;back=776017" title="বিশেষ:সংযোগকারী পৃষ্ঠাসমূহ/টেমপ্লেট:বিষয়শ্রেণীহীন">পরবর্তী ৫০টি</a>'
>>> next = re.findall('<a href="(.*?)".*?>পরবর্তী ৫০টি</a>',data)
>>> next
['/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%B8%E0%A6%82%E0%A6%AF%E0%A7%8B%E0%A6%97%E0%A6%95%E0%A6%BE%E0%A6%B0%E0%A7%80_%E0%A6%AA%E0%A7%83%E0%A6%B7%E0%A7%8D%E0%A6%A0%E0%A6%BE%E0%A6%B8%E0%A6%AE%E0%A7%82%E0%A6%B9/%E0%A6%9F%E0%A7%87%E0%A6%AE%E0%A6%AA%E0%A7%8D%E0%A6%B2%E0%A7%87%E0%A6%9F:%E0%A6%AC%E0%A6%BF%E0%A6%B7%E0%A6%AF%E0%A6%BC%E0%A6%B6%E0%A7%8D%E0%A6%B0%E0%A7%87%E0%A6%A3%E0%A7%80%E0%A6%B9%E0%A7%80%E0%A6%A8&amp;from=0&amp;hidelinks=1']

see the Image here

I am trying to find what is inside the second anchor tag, but why am I getting data from the first tag?

Wiktor Stribiżew
  • 607,720
  • 39
  • 448
  • 563

1 Answer

1

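Any reason why you're using regex to parse HTML? Why not? Read this. As for why you get the first tag: the match starts at the first `<a href="`, the capture group `(.*?)` stops at the first closing quote, and the second `.*?` can match any characters (including `>` and `<`), so it stretches across the rest of the first tag and into the second one until it reaches `>পরবর্তী ৫০টি</a>`. If you do stay with regex, restrict what each part may match, e.g. `'<a href="([^"]*)"[^>]*>পরবর্তী ৫০টি</a>'`.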

You should be using BeautifulSoup for example:

from bs4 import BeautifulSoup

a = '<a href="/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%B8%E0%A6%82%E0%A6%AF%E0%A7%8B%E0%A6%97%E0%A6%95%E0%A6%BE%E0%A6%B0%E0%A7%80_%E0%A6%AA%E0%A7%83%E0%A6%B7%E0%A7%8D%E0%A6%A0%E0%A6%BE%E0%A6%B8%E0%A6%AE%E0%A7%82%E0%A6%B9/%E0%A6%9F%E0%A7%87%E0%A6%AE%E0%A6%AA%E0%A7%8D%E0%A6%B2%E0%A7%87%E0%A6%9F:%E0%A6%AC%E0%A6%BF%E0%A6%B7%E0%A6%AF%E0%A6%BC%E0%A6%B6%E0%A7%8D%E0%A6%B0%E0%A7%87%E0%A6%A3%E0%A7%80%E0%A6%B9%E0%A7%80%E0%A6%A8&amp;from=0&amp;hidelinks=1" title="বিশেষ:সংযোগকারী পৃষ্ঠাসমূহ/টেমপ্লেট:বিষয়শ্রেণীহীন">পূর্ববর্তী ৫০টি</a>) (<a href="/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%B8%E0%A6%82%E0%A6%AF%E0%A7%8B%E0%A6%97%E0%A6%95%E0%A6%BE%E0%A6%B0%E0%A7%80_%E0%A6%AA%E0%A7%83%E0%A6%B7%E0%A7%8D%E0%A6%A0%E0%A6%BE%E0%A6%B8%E0%A6%AE%E0%A7%82%E0%A6%B9/%E0%A6%9F%E0%A7%87%E0%A6%AE%E0%A6%AA%E0%A7%8D%E0%A6%B2%E0%A7%87%E0%A6%9F:%E0%A6%AC%E0%A6%BF%E0%A6%B7%E0%A6%AF%E0%A6%BC%E0%A6%B6%E0%A7%8D%E0%A6%B0%E0%A7%87%E0%A6%A3%E0%A7%80%E0%A6%B9%E0%A7%80%E0%A6%A8&amp;from=950505&amp;hidelinks=1&amp;back=776017" title="বিশেষ:সংযোগকারী পৃষ্ঠাসমূহ/টেমপ্লেট:বিষয়শ্রেণীহীন">পরবর্তী ৫০টি</a>'

print(BeautifulSoup(a, "html.parser").find_all("a"))

The following gets you the `href` of the second anchor (the one whose text is `পরবর্তী ৫০টি`):

from bs4 import BeautifulSoup

a = '<a href="/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%B8%E0%A6%82%E0%A6%AF%E0%A7%8B%E0%A6%97%E0%A6%95%E0%A6%BE%E0%A6%B0%E0%A7%80_%E0%A6%AA%E0%A7%83%E0%A6%B7%E0%A7%8D%E0%A6%A0%E0%A6%BE%E0%A6%B8%E0%A6%AE%E0%A7%82%E0%A6%B9/%E0%A6%9F%E0%A7%87%E0%A6%AE%E0%A6%AA%E0%A7%8D%E0%A6%B2%E0%A7%87%E0%A6%9F:%E0%A6%AC%E0%A6%BF%E0%A6%B7%E0%A6%AF%E0%A6%BC%E0%A6%B6%E0%A7%8D%E0%A6%B0%E0%A7%87%E0%A6%A3%E0%A7%80%E0%A6%B9%E0%A7%80%E0%A6%A8&amp;from=0&amp;hidelinks=1" title="বিশেষ:সংযোগকারী পৃষ্ঠাসমূহ/টেমপ্লেট:বিষয়শ্রেণীহীন">পূর্ববর্তী ৫০টি</a>) (<a href="/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%B8%E0%A6%82%E0%A6%AF%E0%A7%8B%E0%A6%97%E0%A6%95%E0%A6%BE%E0%A6%B0%E0%A7%80_%E0%A6%AA%E0%A7%83%E0%A6%B7%E0%A7%8D%E0%A6%A0%E0%A6%BE%E0%A6%B8%E0%A6%AE%E0%A7%82%E0%A6%B9/%E0%A6%9F%E0%A7%87%E0%A6%AE%E0%A6%AA%E0%A7%8D%E0%A6%B2%E0%A7%87%E0%A6%9F:%E0%A6%AC%E0%A6%BF%E0%A6%B7%E0%A6%AF%E0%A6%BC%E0%A6%B6%E0%A7%8D%E0%A6%B0%E0%A7%87%E0%A6%A3%E0%A7%80%E0%A6%B9%E0%A7%80%E0%A6%A8&amp;from=950505&amp;hidelinks=1&amp;back=776017" title="বিশেষ:সংযোগকারী পৃষ্ঠাসমূহ/টেমপ্লেট:বিষয়শ্রেণীহীন">পরবর্তী ৫০টি</a>'

print([i.get("href") for i in BeautifulSoup(a, "html.parser").find_all("a") if i.text == "পরবর্তী ৫০টি"])

Output:

['/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%B8%E0%A6%82%E0%A6%AF%E0%A7%8B%E0%A6%97%E0%A6%95%E0%A6%BE%E0%A6%B0%E0%A7%80_%E0%A6%AA%E0%A7%83%E0%A6%B7%E0%A7%8D%E0%A6%A0%E0%A6%BE%E0%A6%B8%E0%A6%AE%E0%A7%82%E0%A6%B9/%E0%A6%9F%E0%A7%87%E0%A6%AE%E0%A6%AA%E0%A7%8D%E0%A6%B2%E0%A7%87%E0%A6%9F:%E0%A6%AC%E0%A6%BF%E0%A6%B7%E0%A6%AF%E0%A6%BC%E0%A6%B6%E0%A7%8D%E0%A6%B0%E0%A7%87%E0%A6%A3%E0%A7%80%E0%A6%B9%E0%A7%80%E0%A6%A8&from=950505&hidelinks=1&back=776017']
baduker
  • 19,152
  • 9
  • 33
  • 56
  • First of all, I don't know BeautifulSoup. Second, the data variable is a segmentation of a full page. Your solution only gives the anchor tag with some specific indexing. I need a special type of anchor within a page with above pattern. – Abdullah AL Shohag Sep 20 '20 at 08:15
  • One more thing: what about `&amp;` being converted to `&`? – Abdullah AL Shohag Sep 20 '20 at 08:21
  • I'm using the raw string you gave so I can't account for `amp;` and other elements, as I don't have the entire source. I've updated the answer that should find all `href` for a given `a` with the text value from your regex. – baduker Sep 20 '20 at 08:36
  • What you did here is find all anchor tags, check whether each one's text matches `পরবর্তী ৫০টি`, and then return what's inside `href`. But you used a loop, and the complexity grows as the page gets bigger, which is not desired. – Abdullah AL Shohag Sep 20 '20 at 08:48
  • Complexity with a list comprehension? Using regex to parse HTML is complexity. – baduker Sep 20 '20 at 08:51
  • Are you sure about this? – Abdullah AL Shohag Sep 20 '20 at 08:58