Don't use regex on raw HTML. Use it only on text. This is because regular expressions can only match regular languages, but HTML is a recursively nested language. You need a real parser (for example a recursive descent parser) to properly handle HTML.
First a few useful features of the DOM:
document.body
is the root of the page's content (the topmost node of the whole DOM is document itself)
- Every node of the DOM has a
childNodes
collection - an array-like NodeList rather than a true array (even comments, text, and attributes have one)
- Element nodes such as
<span>
or <h1>
don't contain text, instead they contain text nodes that contain text.
- All nodes have a
nodeType
property, and text nodes have type 3.
- All nodes have a
nodeValue
property that holds different things depending on what kind of node it is. For text nodes nodeValue
contains the actual text.
So, using the information above we can surround all words with a span.
First a simple utility function that allows us to process the DOM:
// A simple implementation of recursive descent: visit `node` and all of its
// descendants, parent before children, and hand each one to a callback.
//
// node     - the DOM node to start from.
// callback - function called with each visited node as its only argument.
//
// <script> elements are skipped together with everything inside them, so the
// callback never sees javascript source. The name is compared without regard
// to case because nodeName is only upper-cased for HTML elements in HTML
// documents; a <script> inside inline SVG or in an XHTML document reports
// "script".
function walkDOM (node,callback) {
  if (String(node.nodeName).toUpperCase() === 'SCRIPT') return; // ignore javascript
  callback(node);
  // childNodes is re-read on every iteration, so nodes added to or removed
  // from it while walking change what gets visited.
  for (var i=0; i<node.childNodes.length; i++) {
    walkDOM(node.childNodes[i],callback);
  }
}
Now we can walk the DOM and find text nodes:
// Gather every text node found under <body>; nodeType 3 identifies a text node.
var textNodes = [];
walkDOM(document.body, function (candidate) {
  if (candidate.nodeType != 3) return; // not a text node, nothing to collect
  textNodes.push(candidate);
});
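Note that I'm doing this in two steps - first collecting the text nodes, then modifying them - to avoid wrapping words twice: if the spans were inserted during the walk, the walk would go on to visit the newly created text nodes and wrap them again.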
Now we can process the text nodes:
// simple utility functions to avoid a lot of typing:
// Insert `new_element` into the DOM immediately before `element`,
// as a child of `element`'s parent node.
function insertBefore (new_element, element) {
  var parent = element.parentNode;
  parent.insertBefore(new_element, element);
}
// Detach `element` from its parent node.
// An element without a parent (never attached, or already removed) is left
// alone instead of raising a TypeError on the missing parentNode.
function removeElement (element) {
  var parent = element.parentNode;
  if (parent) parent.removeChild(element);
}
// Build a <span> element that contains `txt` as its only text node.
//
// txt   - the text to put inside the span.
// attrs - optional object; each of its own enumerable properties is assigned
//         as a property on the new span (plain assignment, not setAttribute),
//         e.g. {id: 5}. Inherited properties are ignored.
//
// Returns the new span; it is not attached to the document.
function makeSpan (txt, attrs) {
  var s = document.createElement('span');
  for (var i in attrs) {
    // Borrow hasOwnProperty from Object.prototype so that objects created
    // with Object.create(null), or carrying their own "hasOwnProperty" key,
    // are handled correctly.
    if (Object.prototype.hasOwnProperty.call(attrs, i)) s[i] = attrs[i];
  }
  s.appendChild(makeText(txt));
  return s;
}
// Create a DOM text node holding `txt`.
function makeText (txt) {
  var textNode = document.createTextNode(txt);
  return textNode;
}
// Replace each collected text node with one <span> per word.
// Spans receive sequential numeric ids starting at 1; whitespace between
// words is copied through unchanged as plain text nodes.
var id_count = 1;
for (var i=0; i<textNodes.length; i++) {
  var n = textNodes[i];
  var txt = n.nodeValue;
  // Split on runs of any whitespace (spaces, tabs, newlines). Because the
  // separator is a capturing group the runs are kept in the result, so the
  // tokens alternate: word, whitespace, word, ...
  var tokens = txt.split(/(\s+)/);
  for (var j=0; j<tokens.length; j++) {
    var token = tokens[j];
    // Leading or trailing whitespace yields an empty "word"; skipping it
    // avoids creating empty spans and wasting ids on them.
    if (token === '') continue;
    if (j % 2 === 1) {
      insertBefore(makeText(token),n); // whitespace run, keep as plain text
    } else {
      insertBefore(makeSpan(token,{id:id_count++}),n); // span surrounded word
    }
  }
  // Now remove the original text node:
  removeElement(n);
}
There you have it. It's cumbersome but is 100% safe - it will never corrupt other tags or javascript in your page. A lot of the utility functions I have above can be replaced with the library of your choice. But don't take the shortcut of treating the entire document as a giant innerHTML
string. Not unless you're willing to write an HTML parser in pure javascript.