You can use a regular expression to pull apart a JavaScript regular expression.
Then you should convert the regex to a lexically simpler subset of JavaScript that avoids all the non-context-free weirdness about what /
means, and any irregularities in the input regex.
var REGEXP_PARTS = "(?:"
// A regular character
+ "[^/\r\n\u2028\u2029\\[\\\\]"
// An escaped character, charset reference or backreference
+ "|\\\\[^\r\n\u2028\u2029]"
// A character set
+ "|\\[(?!\\])(?:[^\\]\\\\]|\\\\[^\r\n\u2028\u2029])+\\]"
+ ")";
var REGEXP_REGEXP = new RegExp(
// A regex starts with a slash
"^[/]"
// It cannot be lexically ambiguous with a line or block comemnt
+ "(?![*/])"
// Capture the body in group 1
+ "(" + REGEXP_PARTS + "+)"
// The body is terminated by a slash
+ "[/]"
// Capture the flags in group 2
+ "([gmi]{0,3})$");
var match = myString.match(REGEXP_REGEXP);
if (match) {
var ctorExpression =
"(new RegExp("
// JSON.stringify escapes special chars in the body, so will
// preserve token boundaries.
+ JSON.stringify(match[1])
+ "," + JSON.stringify(match[2])
+ "))";
alert(ctorExpression);
}
which will result in an expression that is in a well-understood subset of JavaScript.
The complex regex above is not in the TCB. The only part that needs to function correctly for security to hold is the ctorExpression
including the use of JSON.stringify
.