0

I get all the links on the current page, look for the link that I need, and then want to get the anchor text of that link (the text between the opening and closing "a" tags). I tried to use "obj.GetAttribute("innerText")", but it returns an empty string.

// Download the page markup. WebClient is IDisposable, so it is released
// as soon as the download has finished.
string htmlCode;
using (WebClient client = new WebClient()){
    htmlCode = client.DownloadString("http://mysite1.com");
}

// Parse the markup and inspect every <a> element.
CQ cq = CQ.Create(htmlCode);
foreach (IDomObject obj in cq.Find("a")){
    string href = obj.GetAttribute("href");

    // An <a> without an href (e.g. a named anchor) has nothing to match;
    // skipping it also avoids dereferencing a missing attribute value.
    if (string.IsNullOrEmpty(href)){
        continue;
    }

    // Host names are not linguistic text, so compare ordinally.
    if (href.IndexOf("mysite2.com", StringComparison.Ordinal) >= 0){
        //get the anchor of this link
    }
}
Alex
  • 121
  • 5

1 Answers1

0

Finally solved it.

using CsQuery;

// Build a CsQuery document from the previously downloaded markup.
CQ dom = CQ.Create(htmlCode);

// Visit each <a> element and read the markup between its tags.
foreach (IDomObject anchor in dom.Find("a")){
    string linkAnchor = anchor.InnerHTML;
}

But there is a problem with Russian text. In some cases (not always) the Russian text is read as numeric character references instead of the characters themselves. For example, every Russian character looks like "&#1013". So I wrote a function that decodes this representation back into Russian characters.

/// <summary>
/// Replaces numeric character references in <paramref name="input"/>
/// (decimal "&amp;#1055;" or hexadecimal "&amp;#x41F;", with or without the
/// trailing ';') by the characters they denote. All other text, including
/// plain digits and stray ';' characters, is copied through unchanged.
/// </summary>
/// <param name="input">Text that may contain numeric character references.</param>
/// <returns>
/// The decoded text; an empty string when <paramref name="input"/> is null or empty.
/// </returns>
private string DecodeFromUTFCode(string input){
    if (string.IsNullOrEmpty(input)){
        return string.Empty;
    }

    StringBuilder decodedAnchor = new StringBuilder(input.Length);
    int position = 0;

    // Scan left to right so decoded characters keep their original order
    // relative to the surrounding text.
    while (position < input.Length){
        string character;
        int consumed;
        if (TryReadNumericReference(input, position, out character, out consumed)){
            decodedAnchor.Append(character);
            position += consumed;
        }else{
            decodedAnchor.Append(input[position]);
            position++;
        }
    }

    return decodedAnchor.ToString();
}

// Tries to read one numeric character reference starting at 'start'.
// On success returns the decoded text and the number of input chars it covered.
private static bool TryReadNumericReference(string input, int start, out string character, out int consumed){
    character = null;
    consumed = 0;

    // A reference needs "&#" followed by at least one more character.
    if (start + 2 >= input.Length || input[start] != '&' || input[start + 1] != '#'){
        return false;
    }

    int cursor = start + 2;
    int radix = 10;
    if (input[cursor] == 'x' || input[cursor] == 'X'){
        radix = 16;
        cursor++;
    }

    int firstDigit = cursor;
    int codePoint = 0;
    while (cursor < input.Length){
        int digit = DigitValue(input[cursor], radix);
        if (digit < 0){
            break;
        }

        codePoint = codePoint * radix + digit;

        // Beyond the Unicode range: not a valid reference, leave the text as is.
        // Checking on every step also keeps the accumulator from overflowing.
        if (codePoint > 0x10FFFF){
            return false;
        }
        cursor++;
    }

    // "&#" or "&#x" with no digits is ordinary text.
    if (cursor == firstDigit){
        return false;
    }

    // Surrogate code points are rejected by char.ConvertFromUtf32.
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF){
        return false;
    }

    // The terminating ';' is optional; consume it only when present.
    if (cursor < input.Length && input[cursor] == ';'){
        cursor++;
    }

    character = char.ConvertFromUtf32(codePoint);
    consumed = cursor - start;
    return true;
}

// Returns the numeric value of an ASCII digit in the given radix (10 or 16), or -1.
private static int DigitValue(char c, int radix){
    if (c >= '0' && c <= '9'){
        return c - '0';
    }
    if (radix == 16){
        if (c >= 'a' && c <= 'f'){
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F'){
            return c - 'A' + 10;
        }
    }
    return -1;
}
Alex
  • 121
  • 5