Currently working on a url extractor for work. I'm trying to extract all http links/ href links from a downloaded html file and print the links on there own in a separate txt file.So far I've managed to get the entire html of a page downloaded its just extracting the links from it and printing them using Regex is a problem. Wondering if anyone could help me with this?
private void button2_Click(object sender, EventArgs e)
{
Uri fileURI = new Uri(URLbox2.Text);
WebRequest request = WebRequest.Create(fileURI);
request.Credentials = CredentialCache.DefaultCredentials;
WebResponse response = request.GetResponse();
Console.WriteLine(((HttpWebResponse)response).StatusDescription);
Stream dataStream = response.GetResponseStream();
StreamReader reader = new StreamReader(dataStream);
string responseFromServer = reader.ReadToEnd();
SW = File.CreateText("C:\\Users\\Conal_Curran\\OneDrive\\C#\\MyProjects\\Web Crawler\\URLTester\\response1.htm");
SW.WriteLine(responseFromServer);
SW.Close();
string text = System.IO.File.ReadAllText(@"C:\\Users\\Conal_Curran\\OneDrive\\C#\\MyProjects\\Web Crawler\\URLTester\\response1.htm");
string[] links = System.IO.File.ReadAllLines(@"C:\\Users\\Conal_Curran\\OneDrive\\C#\\MyProjects\\Web Crawler\\URLTester\\response1.htm");
Regex regx = new Regex(links, @"http://([\\w+?\\.\\w+])+([a-zA-Z0-9\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)_\\-\\=\\+\\\\\\/\\?\\.\\:\\;\\'\\,]*)?", RegexOptions.IgnoreCase);
MatchCollection mactches = regx.Matches(text);
foreach (Match match in mactches)
{
text = text.Replace(match.Value, "<a href='" + match.Value + "'>" + match.Value + "</a>");
}
SW = File.CreateText("C:\\Users\\Conal_Curran\\OneDrive\\C#\\MyProjects\\Web Crawler\\URLTester\\Links.htm");
SW.WriteLine(links);
}