Short of going to something more complex like measuring information or doing some natural language processing, you can estimate which element on a page contains the content by determining which element has the highest ratio of contained content to contained markup. Here’s a javascript snippet that does just that:
// not perfect obviously. Not terrible neither.
var id, tag;
var all = document.querySelectorAll('body *'), max = 0, el, i, L;
// list some commons ids that denote the outermost element on a page.
var badIds = {
"wrapper" : 1,
"container" : 1,
"wrapper-content" : 1
};
// we don't want to include content from certain tags.
var badTags = {
"SCRIPT" : 1,
"STYLE" : 1,
"HEADER" : 1
}
// the goal rate of markup per content
var contentPercent = 0.45;
var contentRatio = function(el) {
var i, L, totalScript = 0, scripts = el.getElementsByTagName("script");
for (i =0, L= scripts.length; i < L; i++) {
totalScript += scripts[i].length;
}
totalScript = 0;
return (el.textContent.length - totalScript) / el.innerHTML.length;
};
for (i = 0, L =all.length; i < L; i++) {
id = all[i].getAttribute('id');
tag = all[i].tagName;
if (all[i].textContent && all[i].textContent.length > max && (contentRatio(all[i]) > contentPercent) && !badIds[id] && !badTags[tag]) {
max = all[i].textContent.length;
el = all[i];
}
}
// show the results.
console.log(el)
console.log(el.textContent.length / el.innerHTML.length)