I am using node-readability module to cleapup the articles. But in most of the cases, the main image of the article is not part of the grabbed content. So, I have made some changes in the helpers.js and readability.js files to grab the main image of the article ,removing it from the DOM object and adding the image to the articleContent eventually. The code:
helpers.js
var grabImage = module.exports.grabImage = function(document) {
var images = document.getElementsByTagName('IMG');
var MINIMUM_SURFACE = 100*100;
if (images.length > 0) {
for (var i = 0; i < images.length; ++i) {
var image = images[i];
if ( image.getAttribute('data-src') ) {
image.getAttribute('src') = image.getAttribute('data-src');
}
if ( image.getAttribute('data-lazy-src') ) {
image.getAttribute('src') = image.getAttribute('data-lazy- src');
}
if ( !image.getAttribute('src') ) {
continue;
}
// //Compute surface
// var w = image.getAttribute('width') || 1;
// var h = image.getAttribute('height') || 1;
// image.surface = w * h;
var image = new Image();
image.onload = function() {
var image.surface = this.width*this.height;
}
//Filter by size
if ( image.surface > MINIMUM_SURFACE ) {
var mainImageSrc = image.getAttribute('src');
//Resolve relative url
if (!mainImageSrc.match(/^http/)) {
if (!image.ownerDocument.originalURL) {
} else{
mainImageSrc = url.resolve(image.ownerDocument.originalURL, mainImageSrc);
}
}
image.parentNode.removeChild(image);
break;
}
}
}
return mainImageSrc;
};
readability.js
var mainImgUrl = helpers.grabImage(this._document);
var img = this._document.createElement("IMG");
img.setAttribute('src', mainImgUrl);
articleContent.insertBefore(img, articleContent.childNodes[0] );
I am adding this above part of code in this function
Readability.prototype.getContent = function(notDeprecated) {
But, it' not working. The whole content is being grabbed ,but I'm getting this error
> Cleaning Conditionally [object HTMLDivElement] (image width-494:)
Cleaning Conditionally [object HTMLDivElement] (:)
fixed link
C:\Users\SAI\reader-rest\routes\api.js:19
var content = '<html><head><meta charset="utf-8"><title>'+articl
e.title+'</title></head><body>' +article.content+'</body></html>';
TypeError: Cannot read property 'title' of undefined
at C:\Users\SAI\reader-rest\routes\api.js:19:78
at Object.jsdom.env.done (C:\Users\SAI\reader-rest\node_modules\node-readabi
lity\src\readability.js:292:18)
at C:\Users\SAI\reader-rest\node_modules\node-readability\node_modules\jsdom
\lib\jsdom.js:259:18
at nextTickCallbackWith0Args (node.js:420:9)
at process._tickCallback (node.js:349:13)
typeerror: Cannot read property 'title' of undefined at C:\Users\SAI\reader-rest\routes\api.js:19:78 at Object.jsdom.env.done (C:\Users\SAI\reader-rest\node_modules\node-readability\src\readability.js:292:18) at C:\Users\SAI\reader-rest\node_modules\node-readability\node_modules\jsdom\lib\jsdom.js:259:18 at nextTickCallbackWith0Args (node.js:420:9) at process._tickCallback (node.js:349:13)
Can someone please help me resolve this issue.
I have posted the same on github . But No response. https://github.com/luin/readability/issues/52