I'm currently writing a web crawler in C#. I have a method that receives an HTML string, extracts the links from it, and inserts them into the list of all captured links.
Since the crawler is multi-threaded, I use locks to prevent the list of all links from being accessed by several threads at the same time.
Which is the better way to use the locks?
This:
void ProcessHTML(string HTML)
{
    List<string> Links = GetLinks(HTML);
    for (int i = 0; i < Links.Count; i++)
    {
        lock (WebsitesHash)
        {
            lock (AllLinks)
            {
                if (!WebsitesHash.ContainsKey(Links[i]))
                {
                    WebsitesHash[Links[i]] = true;
                    AllLinks.Add(Links[i]);
                }
            }
        }
    }
}
Or this:
void ProcessHTML(string HTML)
{
    List<string> Links = GetLinks(HTML);
    lock (WebsitesHash)
    {
        lock (AllLinks)
        {
            for (int i = 0; i < Links.Count; i++)
            {
                if (!WebsitesHash.ContainsKey(Links[i]))
                {
                    WebsitesHash[Links[i]] = true;
                    AllLinks.Add(Links[i]);
                }
            }
        }
    }
}
Which is generally considered better: locking in each iteration, or locking around all the iterations?
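To make the trade-off concrete, here is a minimal self-contained sketch of the batch-locking variant, assuming a single private lock object guards both collections rather than locking the collections themselves (the LinkStore class and the _sync and AddRange names are just illustrative, not part of my actual code):

using System.Collections.Generic;

class LinkStore
{
    // One private object guards both collections, so each batch takes
    // a single lock instead of two nested ones.
    private readonly object _sync = new object();
    private readonly Dictionary<string, bool> _websitesHash = new Dictionary<string, bool>();
    private readonly List<string> _allLinks = new List<string>();

    public void AddRange(List<string> links)
    {
        // Take the lock once for the whole batch of links.
        lock (_sync)
        {
            foreach (string link in links)
            {
                if (!_websitesHash.ContainsKey(link))
                {
                    _websitesHash[link] = true;
                    _allLinks.Add(link);
                }
            }
        }
    }
}

Functionally this is the same as the second version above; there is just one lock to take per batch.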
Other code that might be relevant:
void StartCrawl(string Seed)
{
    AllLinks.Capacity = 1000 * 1000 * 10;
    StreamWriter Log = new StreamWriter(File.Open("Websites.txt", FileMode.Append));
    string HTML = GetHTML(Seed);
    ProcessHTML(HTML);
    for (int i = 0; i < AllLinks.Count; i++)
    {
        if (!Work)
        {
            Log.Close();
            WebsitesHash = new Dictionary<string, bool>();
            break;
        }
        Log.WriteLine(AllLinks[i]);
        websText.Text = AllLinks.Count.ToString();
        // A fresh local per iteration, so the lambda below captures this
        // iteration's page rather than a variable shared by all iterations.
        string page;
        try { page = GetHTML(AllLinks[i]); }
        catch { continue; }
        Thread Parser = new Thread(() => ProcessHTML(page));
        Parser.Start();
    }
}
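For completeness, the shared fields the snippets above rely on look roughly like this (declarations inferred from how they are used; marking Work as volatile is an assumption):

// Shared state touched by both StartCrawl and the parser threads.
Dictionary<string, bool> WebsitesHash = new Dictionary<string, bool>();
List<string> AllLinks = new List<string>();
volatile bool Work = true; // stop flag polled by the crawl loop (volatile assumed)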