My code below uses C# and HtmlAgilityPack to scrape a webpage and then uses WebClient to download a string from another webpage. This works great on localhost, but when I publish my code as an API service on Azure or execute it on a web hosting service (e.g. HostGator), I always receive a 403 Forbidden error. I've tried many ways to get this to work and cannot for the life of me figure it out. Any help would be greatly appreciated.
// Loads the record page, extracts the manifest URL from the inline script text that
// contains "manifestId:", and downloads the manifest document as a string.
HtmlWeb web = new HtmlWeb();
HtmlDocument doc = web.Load("https://antenati.cultura.gov.it/ark:/12657/an_ud18290200");
//string returnedResult = doc.DocumentNode.OuterHtml; //this shows a 403 forbidden error response when not running from localhost.

// SelectSingleNode returns null when nothing matches (for example when the server
// answered with a 403 error page instead of the record page), so guard before
// reading InnerText to avoid a NullReferenceException.
HtmlNode manifestNode = doc.DocumentNode.SelectSingleNode("//*[text()[contains(., 'manifestId:')]]");
string ress = manifestNode != null ? manifestNode.InnerText : null;
if (!string.IsNullOrEmpty(ress))
{
    // Everything after the first "manifestId:" marker starts with the quoted URL.
    string[] strPieces = ress.Split(new string[] { "manifestId:" }, StringSplitOptions.None);
    if (strPieces.Length >= 2)
    {
        // The URL runs up to the next comma; strip the surrounding single quotes and whitespace.
        string manifestUrl = strPieces[1].Split(',')[0].Replace("'", "").Trim();
        string result;

        // WebClient is IDisposable; dispose it once the download has completed.
        using (WebClient wb = new WebClient())
        {
            wb.Headers.Add("origin", "https://antenati.cultura.gov.it");
            wb.Headers.Add("referer", "https://antenati.cultura.gov.it/");
            wb.Headers.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36");
            result = wb.DownloadString(manifestUrl);
        }
    }
}
Here is the code I have tried on https://dotnetfiddle.net, which also results in a 403 error:
using System;
using System.IO;
using System.Net;
/// <summary>
/// Minimal reproduction: requests the record page with browser-like headers
/// and writes the response body to the console.
/// </summary>
public class Program
{
    /// <summary>
    /// Issues a single GET request and prints the response body.
    /// </summary>
    public static void Main()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create("https://antenati.cultura.gov.it/ark:/12657/an_ud18290200");
        //request.Proxy = new WebProxy("173.192.21.89", 80);
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8";
        //request.Connection = "keep-alive";
        request.Headers.Add("Accept-Language", "en-US,en;q=0.5");
        //request.Headers.Add("Accept-Encoding", "gzip, deflate");
        request.Headers.Add("Upgrade-Insecure-Requests", "1");
        request.Headers.Add("Sec-Fetch-Dest", "document");
        request.Headers.Add("Sec-Fetch-Mode", "navigate");
        request.Headers.Add("Sec-Fetch-Site", "none");
        request.Headers.Add("Sec-Fetch-User", "?1");
        request.Headers.Add("Cache-Control", "max-age=0");

        string text;

        // Get the response. Both the response and its stream hold the underlying
        // connection, so dispose them deterministically once the body is read.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (var sr = new StreamReader(body))
        {
            text = sr.ReadToEnd();
        }

        Console.WriteLine(text);
    }
}