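DOM is the right tool for this. It lets you find and traverse the text nodes, so the markup itself is never touched. Apply the regular expression to the content of each text node, build the replacement nodes in a document fragment, and then replace the original text node with that fragment.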
/**
 * Wraps every regex match found in the text nodes below $node in a new element.
 *
 * Each descendant text node whose content matches $pattern is replaced by the
 * unmatched text parts interleaved with one $tagName element per match. Only
 * text node content is inspected, so existing markup is never matched.
 *
 * Zero-length matches are ignored and no empty text nodes are created, so a
 * text node consisting solely of one match becomes exactly one wrapper element.
 *
 * @param \DOMNode $node          Document or node whose descendant text nodes are processed.
 * @param string   $pattern       PCRE pattern including delimiters and modifiers.
 * @param string   $tagName       Name of the wrapper element created for each match.
 * @param iterable $tagAttributes Attribute name => value pairs set on every wrapper element.
 *
 * @throws \RuntimeException If preg_match_all() reports a failure for the pattern.
 */
function wrapMatches(\DOMNode $node, string $pattern, string $tagName, $tagAttributes = [])
{
    // Fully qualified class names keep this working when pasted into a namespaced file.
    $document = $node instanceof \DOMDocument ? $node : $node->ownerDocument;
    $xpath = new \DOMXPath($document);

    // The XPath result is not a live list, so text nodes can be replaced while iterating.
    foreach ($xpath->evaluate('.//text()', $node) as $textNode) {
        $content = $textNode->textContent;
        $found = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);

        if ($found === false) {
            // Do not silently leave the text unwrapped when the regex engine fails.
            throw new \RuntimeException(
                sprintf(
                    'preg_match_all() failed for pattern "%s" (PCRE error code %d).',
                    $pattern,
                    preg_last_error()
                )
            );
        }
        if ($found === 0) {
            continue;
        }

        // A fragment allows several sibling nodes to replace the single text node.
        $fragment = $document->createDocumentFragment();
        $offset = 0;      // byte position just behind the previous match
        $wrapped = false; // becomes true once at least one wrapper was created

        foreach ($matches[0] as $match) {
            $matchContent = $match[0];
            $matchStart = $match[1];

            if ($matchContent === '') {
                // Zero-length matches (e.g. from "\d*") would only yield empty wrappers.
                continue;
            }

            // PREG_OFFSET_CAPTURE reports byte offsets (also with the "u" modifier),
            // so the byte-based substr()/strlen() are the matching functions here.
            if ($matchStart > $offset) {
                // text between the previous match and the current one
                $fragment->appendChild(
                    $document->createTextNode(substr($content, $offset, $matchStart - $offset))
                );
            }

            // add the wrapper element, set its attributes and its text content
            $wrapper = $fragment->appendChild($document->createElement($tagName));
            foreach ($tagAttributes as $attributeName => $attributeValue) {
                $wrapper->setAttribute($attributeName, $attributeValue);
            }
            $wrapper->textContent = $matchContent;

            $offset = $matchStart + strlen($matchContent);
            $wrapped = true;
        }

        if (!$wrapped) {
            // only zero-length matches: leave the text node untouched
            continue;
        }

        if ($offset < strlen($content)) {
            // text after the last match
            $fragment->appendChild($document->createTextNode(substr($content, $offset)));
        }

        // replace the text node with the new fragment
        $textNode->parentNode->replaceChild($fragment, $textNode);
    }
}
// Demo: wrap every run of digits in the sample markup in a <say-as> element.
$xml = '<speak>The test number is 123456789, and some further block of text.</speak>';

$document = new DOMDocument();
$document->loadXML($xml);

wrapMatches(
    $document,
    '(\d+)u',
    'say-as',
    ['interpret-as' => 'characters']
);

// Print the resulting document.
print $document->saveXML();