I am attempting to write an implementation for the following (prototype) method:
var result = browser.GetHtml(string url);
I need this because a number of pages push a mound of JavaScript to the browser, and that JavaScript then renders the page. The only way to retrieve such pages reliably is to let the JavaScript execute in a browser environment before retrieving the resulting HTML.
My current attempt uses CefGlue. After downloading this project and combining it with the code in this answer, I came up with the following code (included here for completeness):
using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.Drawing.Printing;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using Xilium.CefGlue;
namespace OffScreenCefGlue
{
/// <summary>
/// Console entry point: boots CEF, opens a single off-screen browser on a
/// fixed URL, and waits for console input before shutting CEF down.
/// </summary>
internal class Program
{
    /// <summary>
    /// Runs the CEF bootstrap sequence for this executable.
    /// </summary>
    /// <param name="args">Command-line arguments; they are not forwarded to CEF here.</param>
    private static void Main(string[] args)
    {
        // Load CEF. This checks for the correct CEF version.
        CefRuntime.Load();

        var cefMainArgs = new CefMainArgs(new string[0]);
        var cefApp = new DemoCefApp();

        // This is where the code path diverges for child processes.
        // Per the CEF documentation, ExecuteProcess returns -1 only in the
        // browser (main) process. Any other value is the exit code of a
        // secondary process that has already run, so this process must stop
        // here instead of initializing CEF a second time.
        int exitCode = CefRuntime.ExecuteProcess(cefMainArgs, cefApp);
        if (exitCode != -1)
        {
            Console.Error.WriteLine("CefRuntime could not create the secondary process.");
            Environment.ExitCode = exitCode;
            return;
        }

        // Settings for all of CEF (e.g. process management and control).
        var cefSettings = new CefSettings
        {
            SingleProcess = false,
            MultiThreadedMessageLoop = true
        };

        // Initialize CEF in this process.
        CefRuntime.Initialize(cefMainArgs, cefSettings, cefApp);

        // Instruct CEF to not render to a window at all.
        CefWindowInfo cefWindowInfo = CefWindowInfo.Create();
        cefWindowInfo.SetAsOffScreen(IntPtr.Zero);

        // Settings for the browser window itself (e.g. should JavaScript be enabled?).
        var cefBrowserSettings = new CefBrowserSettings();

        // Set up the custom handlers used to interact with the browser.
        // The browser window will be 1280 x 720 (pixels).
        var cefClient = new DemoCefClient(1280, 720);

        // Start up the browser instance.
        string url = "http://www.reddit.com/";
        CefBrowserHost.CreateBrowser(cefWindowInfo, cefClient, cefBrowserSettings, url);

        // Block on console input to let the browser do its work.
        Console.Read();

        // Clean up CEF.
        CefRuntime.Shutdown();
    }
}
/// <summary>
/// Application-level CEF hook object. It overrides nothing, so every
/// <see cref="CefApp"/> member keeps its inherited behavior.
/// </summary>
internal class DemoCefApp : CefApp { }
/// <summary>
/// Browser client that owns one render handler and one load handler and
/// returns them from the corresponding <see cref="CefClient"/> overrides.
/// </summary>
internal class DemoCefClient : CefClient
{
    private readonly DemoCefLoadHandler _loadHandler;
    private readonly DemoCefRenderHandler _renderHandler;

    /// <summary>
    /// Creates the client together with its two handlers.
    /// </summary>
    /// <param name="windowWidth">Width passed on to the render handler.</param>
    /// <param name="windowHeight">Height passed on to the render handler.</param>
    public DemoCefClient(int windowWidth, int windowHeight)
    {
        this._renderHandler = new DemoCefRenderHandler(windowWidth, windowHeight);
        this._loadHandler = new DemoCefLoadHandler();
    }

    /// <summary>Returns the load handler created in the constructor.</summary>
    protected override CefLoadHandler GetLoadHandler()
    {
        return this._loadHandler;
    }

    /// <summary>Returns the render handler created in the constructor.</summary>
    protected override CefRenderHandler GetRenderHandler()
    {
        return this._renderHandler;
    }
}
/// <summary>
/// Load handler that logs the start and end of main-frame loads and keeps
/// the most recently retrieved main-frame source in <see cref="Html"/>.
/// </summary>
internal class DemoCefLoadHandler : CefLoadHandler
{
    /// <summary>
    /// Source text obtained at the end of the last main-frame load; it stays
    /// unset until the awaited retrieval in <c>OnLoadEnd</c> has completed.
    /// </summary>
    public string Html { get; private set; }

    /// <summary>
    /// Writes a "START" line with the main frame URL when the main frame
    /// begins loading.
    /// </summary>
    protected override void OnLoadStart(CefBrowser browser, CefFrame frame)
    {
        // Load events are also raised for sub-frames (<FRAME>, <IFRAME>);
        // only the main frame is reported.
        if (!frame.IsMain)
        {
            return;
        }

        string mainFrameUrl = browser.GetMainFrame().Url;
        Console.WriteLine("START: {0}", mainFrameUrl);
    }

    /// <summary>
    /// When the main frame finishes loading, awaits its source, stores it in
    /// <see cref="Html"/>, and writes an "END" line with the URL and status code.
    /// </summary>
    protected override async void OnLoadEnd(CefBrowser browser, CefFrame frame, int httpStatusCode)
    {
        if (!frame.IsMain)
        {
            return;
        }

        Html = await browser.GetSourceAsync();

        // The URL is read after the await, exactly as the log line needs it.
        string mainFrameUrl = browser.GetMainFrame().Url;
        Console.WriteLine("END: {0}, {1}", mainFrameUrl, httpStatusCode);
    }
}
/// <summary>
/// Off-screen render handler that reports a fixed-size view rectangle and
/// saves every painted frame to a PNG file.
/// </summary>
internal class DemoCefRenderHandler : CefRenderHandler
{
    private readonly int _windowHeight;
    private readonly int _windowWidth;

    /// <summary>
    /// Creates a handler reporting a view of the given size.
    /// </summary>
    /// <param name="windowWidth">Width reported from <c>GetViewRect</c>.</param>
    /// <param name="windowHeight">Height reported from <c>GetViewRect</c>.</param>
    public DemoCefRenderHandler(int windowWidth, int windowHeight)
    {
        _windowWidth = windowWidth;
        _windowHeight = windowHeight;
    }

    /// <summary>Reports the same rectangle as <c>GetViewRect</c>.</summary>
    protected override bool GetRootScreenRect(CefBrowser browser, ref CefRectangle rect)
    {
        return GetViewRect(browser, ref rect);
    }

    /// <summary>Maps view coordinates to screen coordinates unchanged.</summary>
    protected override bool GetScreenPoint(CefBrowser browser, int viewX, int viewY, ref int screenX, ref int screenY)
    {
        screenX = viewX;
        screenY = viewY;
        return true;
    }

    /// <summary>
    /// Fills <paramref name="rect"/> with an origin of (0, 0) and the size
    /// given to the constructor.
    /// </summary>
    protected override bool GetViewRect(CefBrowser browser, ref CefRectangle rect)
    {
        rect.X = 0;
        rect.Y = 0;
        rect.Width = _windowWidth;
        rect.Height = _windowHeight;
        return true;
    }

    /// <summary>Provides no screen information; always returns false.</summary>
    protected override bool GetScreenInfo(CefBrowser browser, CefScreenInfo screenInfo)
    {
        return false;
    }

    /// <summary>Intentionally does nothing.</summary>
    protected override void OnPopupSize(CefBrowser browser, CefRectangle rect)
    {
    }

    /// <summary>
    /// Wraps the supplied pixel buffer in a bitmap (stride of
    /// <c>width * 4</c> bytes, 32bpp RGB) and writes it to "LastOnPaint.png".
    /// </summary>
    protected override void OnPaint(CefBrowser browser, CefPaintElementType type, CefRectangle[] dirtyRects, IntPtr buffer, int width, int height)
    {
        // Save the provided buffer (a bitmap image) as a PNG.
        // Bitmap is IDisposable; dispose it after saving so that each paint
        // callback does not leave a GDI+ object behind.
        using (var bitmap = new Bitmap(width, height, width * 4, PixelFormat.Format32bppRgb, buffer))
        {
            bitmap.Save("LastOnPaint.png", ImageFormat.Png);
        }
    }

    /// <summary>Intentionally does nothing.</summary>
    protected override void OnCursorChange(CefBrowser browser, IntPtr cursorHandle)
    {
    }

    /// <summary>Intentionally does nothing.</summary>
    protected override void OnScrollOffsetChanged(CefBrowser browser)
    {
    }
}
/// <summary>
/// Adapts the <see cref="CefStringVisitor"/> callback to a
/// <see cref="Task{TResult}"/> that completes with the visited string.
/// </summary>
public class TaskStringVisitor : CefStringVisitor
{
    private readonly TaskCompletionSource<string> _taskCompletionSource;

    /// <summary>
    /// Creates a visitor whose <see cref="Task"/> is not yet completed.
    /// </summary>
    public TaskStringVisitor()
    {
        // RunContinuationsAsynchronously (.NET Framework 4.6+) keeps awaiting
        // code from resuming inline inside Visit, i.e. on whichever thread
        // invokes the callback; blocking that thread from a continuation
        // would stall the caller.
        _taskCompletionSource = new TaskCompletionSource<string>(
            TaskCreationOptions.RunContinuationsAsynchronously);
    }

    /// <summary>
    /// Completes <see cref="Task"/> with the supplied string.
    /// </summary>
    /// <param name="value">The string delivered to the visitor.</param>
    protected override void Visit(string value)
    {
        // TrySetResult does not throw if the task was already completed by
        // an earlier call.
        _taskCompletionSource.TrySetResult(value);
    }

    /// <summary>
    /// Task that completes with the string passed to <c>Visit</c>.
    /// </summary>
    public Task<string> Task
    {
        get { return _taskCompletionSource.Task; }
    }
}
/// <summary>
/// Task-returning extension helpers for <see cref="CefBrowser"/>.
/// </summary>
public static class CEFExtensions
{
    /// <summary>
    /// Requests the source of the browser's main frame through a
    /// <see cref="TaskStringVisitor"/>.
    /// </summary>
    /// <param name="browser">Browser whose main frame source is requested.</param>
    /// <returns>The visitor's task, which yields the string passed to the visitor.</returns>
    public static Task<string> GetSourceAsync(this CefBrowser browser)
    {
        var visitor = new TaskStringVisitor();
        CefFrame mainFrame = browser.GetMainFrame();
        mainFrame.GetSource(visitor);
        return visitor.Task;
    }
}
}
The relevant bit of code is here:
// When the main frame finishes loading, awaits its source, stores it in
// Html, and writes an "END" line with the URL and HTTP status code.
protected override async void OnLoadEnd(CefBrowser browser, CefFrame frame, int httpStatusCode)
{
    // Sub-frame load events are ignored.
    if (!frame.IsMain)
    {
        return;
    }

    Html = await browser.GetSourceAsync();

    string mainFrameUrl = browser.GetMainFrame().Url;
    Console.WriteLine("END: {0}, {1}", mainFrameUrl, httpStatusCode);
}
This actually appears to work; you can examine the Html variable with the debugger, and there is an HTML page in there. The problem is, the Html variable does me no good in that callback method; it's buried three layers deep in a class hierarchy, and I need to return it in the method I'm trying to write without creating a Schroedinbug.
(Attempting to get the result from that string Html property, including trying to view it with the HTML visualizer in the debugger, appears to cause a deadlock, something I'd really like to avoid, especially since this code is going to run on a server.)
How do I implement my var result = browser.GetHtml(string url); method safely and reliably?
Bonus question: Could the callback mechanisms in the above code be converted to Tasks using this technique? What would that look like?