I am loading a lot of XML documents, and some of them fail with errors like "hexadecimal value 0x12, is an invalid character"; the offending character differs from document to document. How can I remove these characters?
-
almost certainly you have a BOM at the start of your document see https://en.wikipedia.org/wiki/Byte_order_mark – pm100 May 11 '18 at 16:04
7 Answers
I did a little research here.
Here is the ASCII table. There are 128 symbols
Here is some small test code which adds every symbol from ASCII table and tries to load it as an XML document.
/// <summary>
/// Experiment: substitutes each of the 128 ASCII characters for the placeholder
/// character 'П' in test.xml, tries to load the result as an XML document and
/// prints to the console whether loading succeeded or which error was reported.
/// </summary>
public static void RegexTry()
{
    string xmlfile;
    // The using block releases the file handle even when ReadToEnd throws;
    // a manual Close() call would be skipped in that case.
    using (StreamReader stream = new StreamReader(@"test.xml"))
    {
        xmlfile = stream.ReadToEnd();
    }

    for (int i = 0; i < 128; i++)
    {
        char t = (char)i;
        // Always substitute into the pristine template so each character is tested alone.
        string text = xmlfile.Replace('П', t);
        XmlDocument xml = new XmlDocument();
        try
        {
            xml.LoadXml(text);
        }
        catch (XmlException ex)
        {
            // Only parse failures are interesting here; anything else should surface.
            Console.WriteLine("Char(" + i.ToString() + "): " + t + " => error! " + ex.Message);
            continue;
        }
        Console.WriteLine("Char(" + i.ToString() + "): " + t + " => fine!");
    }

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}
As a result it returns:
Char(0): => error! '.', hexadecimal value 0x00, is an invalid character. Line 5, position 7.
Char(1): => error! '', hexadecimal value 0x01, is an invalid character. Line 5, position 7.
Char(2): => error! '', hexadecimal value 0x02, is an invalid character. Line 5, position 7.
Char(3): => error! '', hexadecimal value 0x03, is an invalid character. Line 5, position 7.
Char(4): => error! '', hexadecimal value 0x04, is an invalid character. Line 5, position 7.
Char(5): => error! '', hexadecimal value 0x05, is an invalid character. Line 5, position 7.
Char(6): => error! '', hexadecimal value 0x06, is an invalid character. Line 5, position 7.
Char(7): => error! '', hexadecimal value 0x07, is an invalid character. Line 5, position 7.
Char(8): => error! '', hexadecimal value 0x08, is an invalid character. Line 5, position 7.
Char(9): => fine!
Char(10):
=> fine!
Char(11): => error! '', hexadecimal value 0x0B, is an invalid character. Line 5, position 7.
Char(12): => error! '', hexadecimal value 0x0C, is an invalid character. Line 5, position 7.
Char(13):
=> fine!
Char(14): => error! '', hexadecimal value 0x0E, is an invalid character. Line 5, position 7.
Char(15): => error! '', hexadecimal value 0x0F, is an invalid character. Line 5, position 7.
Char(16): => error! '', hexadecimal value 0x10, is an invalid character. Line 5, position 7.
Char(17): => error! '', hexadecimal value 0x11, is an invalid character. Line 5, position 7.
Char(18): => error! '', hexadecimal value 0x12, is an invalid character. Line 5, position 7.
Char(19): => error! '', hexadecimal value 0x13, is an invalid character. Line 5, position 7.
Char(20): => error! '', hexadecimal value 0x14, is an invalid character. Line 5, position 7.
Char(21): => error! '', hexadecimal value 0x15, is an invalid character. Line 5, position 7.
Char(22): => error! '', hexadecimal value 0x16, is an invalid character. Line 5, position 7.
Char(23): => error! '', hexadecimal value 0x17, is an invalid character. Line 5, position 7.
Char(24): => error! '', hexadecimal value 0x18, is an invalid character. Line 5, position 7.
Char(25): => error! '', hexadecimal value 0x19, is an invalid character. Line 5, position 7.
Char(26): => error! '', hexadecimal value 0x1A, is an invalid character. Line 5, position 7.
Char(27): => error! '', hexadecimal value 0x1B, is an invalid character. Line 5, position 7.
Char(28): => error! '', hexadecimal value 0x1C, is an invalid character. Line 5, position 7.
Char(29): => error! '', hexadecimal value 0x1D, is an invalid character. Line 5, position 7.
Char(30): => error! '', hexadecimal value 0x1E, is an invalid character. Line 5, position 7.
Char(31): => error! '', hexadecimal value 0x1F, is an invalid character. Line 5, position 7.
Char(32): => fine!
Char(33): ! => fine!
Char(34): " => fine!
Char(35): # => fine!
Char(36): $ => fine!
Char(37): % => fine!
Char(38): => error! An error occurred while parsing EntityName. Line 5, position 8.
Char(39): ' => fine!
Char(40): ( => fine!
Char(41): ) => fine!
Char(42): * => fine!
Char(43): + => fine!
Char(44): , => fine!
Char(45): - => fine!
Char(46): . => fine!
Char(47): / => fine!
Char(48): 0 => fine!
Char(49): 1 => fine!
Char(50): 2 => fine!
Char(51): 3 => fine!
Char(52): 4 => fine!
Char(53): 5 => fine!
Char(54): 6 => fine!
Char(55): 7 => fine!
Char(56): 8 => fine!
Char(57): 9 => fine!
Char(58): : => fine!
Char(59): ; => fine!
Char(60): => error! The '<' character, hexadecimal value 0x3C, cannot be included in a name. Line 5, position 13.
Char(61): = => fine!
Char(62): > => fine!
Char(63): ? => fine!
Char(64): @ => fine!
Char(65): A => fine!
Char(66): B => fine!
Char(67): C => fine!
Char(68): D => fine!
Char(69): E => fine!
Char(70): F => fine!
Char(71): G => fine!
Char(72): H => fine!
Char(73): I => fine!
Char(74): J => fine!
Char(75): K => fine!
Char(76): L => fine!
Char(77): M => fine!
Char(78): N => fine!
Char(79): O => fine!
Char(80): P => fine!
Char(81): Q => fine!
Char(82): R => fine!
Char(83): S => fine!
Char(84): T => fine!
Char(85): U => fine!
Char(86): V => fine!
Char(87): W => fine!
Char(88): X => fine!
Char(89): Y => fine!
Char(90): Z => fine!
Char(91): [ => fine!
Char(92): \ => fine!
Char(93): ] => fine!
Char(94): ^ => fine!
Char(95): _ => fine!
Char(96): ` => fine!
Char(97): a => fine!
Char(98): b => fine!
Char(99): c => fine!
Char(100): d => fine!
Char(101): e => fine!
Char(102): f => fine!
Char(103): g => fine!
Char(104): h => fine!
Char(105): i => fine!
Char(106): j => fine!
Char(107): k => fine!
Char(108): l => fine!
Char(109): m => fine!
Char(110): n => fine!
Char(111): o => fine!
Char(112): p => fine!
Char(113): q => fine!
Char(114): r => fine!
Char(115): s => fine!
Char(116): t => fine!
Char(117): u => fine!
Char(118): v => fine!
Char(119): w => fine!
Char(120): x => fine!
Char(121): y => fine!
Char(122): z => fine!
Char(123): { => fine!
Char(124): | => fine!
Char(125): } => fine!
Char(126): ~ => fine!
Char(127): => fine!
You can see that there are a lot of symbols which cannot appear in XML text. To remove them we can use Regex.Replace:
/// <summary>
/// Removes the control characters that are not allowed in XML 1.0 text
/// (U+0000-U+0008, U+000B, U+000C and U+000E-U+001F) from <paramref name="txt"/>.
/// Tab, line feed and carriage return are kept.
/// </summary>
/// <param name="txt">The text to clean.</param>
/// <returns>The text with the forbidden control characters removed.</returns>
static string ReplaceHexadecimalSymbols(string txt)
{
    // '&' (0x26) is deliberately NOT part of the set: it is a legal XML character
    // that introduces entity references, so stripping it would turn "&lt;" into "lt;".
    // The pattern is a verbatim string so the regex engine reads each fixed-width \xNN
    // escape itself, instead of relying on C#'s variable-length \x string escape.
    const string invalidXmlChars = @"[\x00-\x08\x0B\x0C\x0E-\x1F]";
    return Regex.Replace(txt, invalidXmlChars, string.Empty, RegexOptions.Compiled);
}
PS. Sorry if everybody knew that.

- 8,018
- 10
- 47
- 62

- 1,844
- 2
- 16
- 26
-
9The big question would probably be how they get into an XML document in the first place. – PMF Jan 10 '14 at 19:50
-
9You should not use trial and error here. Consult the standard. My answer contains the pertinent portion. Trial and error is what has led to you writing a regex that removes all `&` characters from your XML documents. That will not end well! – David Heffernan Jan 10 '14 at 19:55
The XML specification defines the valid characters like this:
Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
As you can see #x12
is not a valid character in an XML document.
You ask how to remove them but I think that is not the question you should be asking. They should simply not be present. You should reject any such document as mal-formed. Simply removing invalid characters suppresses the real problem.
If you are creating the documents in question then you need to fix the code that generates it so that it generates valid XML.

- 601,492
- 42
- 1,072
- 1,490
I think x26 ("&") is a valid character, and XML can deserialize it.
So, to remove the illegal characters, we should use:
// Strip the control characters below 0x20 that XML does not allow
// (everything except tab, line feed and carriage return).
// Reference: http://www.w3.org/TR/xml/#charsets
string illegalXmlCharsPattern = "[\x00-\x08\x0B\x0C\x0E-\x1F]";
xml = Regex.Replace(xml, illegalXmlCharsPattern, "", RegexOptions.Compiled);

- 8,109
- 4
- 36
- 50

- 111
- 1
- 3
Regex solution is working quite fast even on 100MB XML document.
The following expression string would do the work.
"[\x00-\x08\x0B\x0C\x0E-\x1F]"

- 115
- 1
- 1
- 6
This is essentially a special case of this question. I suggest you use one of the answers from there.

- 1
- 1

- 7,216
- 2
- 35
- 46
Just update these functions with the fix provided by jhon, and check every place in your code where they are used. It will work for you; I have tested it.
/// <summary>
/// Writes the contents of <paramref name="dt"/> into the first SheetData element of the
/// worksheet: one header row holding the column names, then one row per DataRow.
/// </summary>
/// <param name="dt">The table whose columns and rows are written out.</param>
/// <param name="worksheetPart">The worksheet part that receives the rows.</param>
private static void WriteDataTableToExcelWorksheet(DataTable dt, WorksheetPart worksheetPart)
{
    var worksheet = worksheetPart.Worksheet;
    var sheetData = worksheet.GetFirstChild<SheetData>();

    // Pre-compute the Excel column name for every DataTable column, and remember which
    // columns are numeric so that data cells can later be written as numbers or as text.
    int numberOfColumns = dt.Columns.Count;
    bool[] isNumericColumn = new bool[numberOfColumns];
    string[] excelColumnNames = new string[numberOfColumns];
    for (int n = 0; n < numberOfColumns; n++)
    {
        excelColumnNames[n] = GetExcelColumnName(n);
    }

    // Header row: one text cell per column, in row 1.
    uint rowIndex = 1;
    var headerRow = new Row { RowIndex = rowIndex };
    sheetData.Append(headerRow);
    for (int colInx = 0; colInx < numberOfColumns; colInx++)
    {
        DataColumn col = dt.Columns[colInx];
        AppendTextCell(excelColumnNames[colInx] + "1", col.ColumnName, headerRow);
        isNumericColumn[colInx] = (col.DataType.FullName == "System.Decimal") || (col.DataType.FullName == "System.Int32");
    }

    // Data rows: each DataRow becomes the next row below the header.
    foreach (DataRow dr in dt.Rows)
    {
        ++rowIndex;
        var newExcelRow = new Row { RowIndex = rowIndex };
        sheetData.Append(newExcelRow);

        for (int colInx = 0; colInx < numberOfColumns; colInx++)
        {
            // Use the row indexer: ItemArray would copy the whole row for every single cell.
            string cellValue = dr[colInx].ToString();
            string cellReference = excelColumnNames[colInx] + rowIndex.ToString();

            if (isNumericColumn[colInx])
            {
                // Only write the cell when the value really is a number; a value that does
                // not parse (for example an empty/NULL value) leaves the cell out entirely.
                double cellNumericValue;
                if (double.TryParse(cellValue, out cellNumericValue))
                {
                    // Format with the invariant culture so the decimal separator is always '.',
                    // regardless of the current culture that ToString()/TryParse used above.
                    cellValue = cellNumericValue.ToString(System.Globalization.CultureInfo.InvariantCulture);
                    AppendNumericCell(cellReference, cellValue, newExcelRow);
                }
            }
            else
            {
                // Text cells are written as they are; AppendTextCell sanitizes the value.
                AppendTextCell(cellReference, cellValue, newExcelRow);
            }
        }
    }
}
/// <summary>
/// Removes the control characters that are not allowed in XML 1.0 text
/// (U+0000-U+0008, U+000B, U+000C and U+000E-U+001F) from <paramref name="txt"/>.
/// Tab, line feed and carriage return are kept.
/// </summary>
/// <param name="txt">The cell text to clean.</param>
/// <returns>The text with the forbidden control characters removed.</returns>
static string ReplaceHexadecimalSymbols(string txt)
{
    // '&' (0x26) is deliberately NOT part of the set: it is a legal XML character, and
    // removing it would silently delete every ampersand from the data ("AT&T" -> "ATT").
    // The pattern is a verbatim string so the regex engine reads each fixed-width \xNN
    // escape itself, instead of relying on C#'s variable-length \x string escape.
    const string invalidXmlChars = @"[\x00-\x08\x0B\x0C\x0E-\x1F]";
    return Regex.Replace(txt, invalidXmlChars, string.Empty, RegexOptions.Compiled);
}
/// <summary>
/// Appends a string-typed cell with the given reference to <paramref name="excelRow"/>.
/// The text is passed through ReplaceHexadecimalSymbols before it is stored.
/// </summary>
private static void AppendTextCell(string cellReference, string cellStringValue, Row excelRow)
{
    // Build the cell marked as a string value.
    var textCell = new Cell { CellReference = cellReference, DataType = CellValues.String };

    // Attach the sanitized text as the cell's value, then hang the cell on the row.
    var sanitizedValue = new CellValue { Text = ReplaceHexadecimalSymbols(cellStringValue) };
    textCell.Append(sanitizedValue);
    excelRow.Append(textCell);
}
/// <summary>
/// Appends a cell without an explicit data type to <paramref name="excelRow"/>, holding
/// <paramref name="cellStringValue"/> after it has been passed through ReplaceHexadecimalSymbols.
/// </summary>
private static void AppendNumericCell(string cellReference, string cellStringValue, Row excelRow)
{
    // No DataType is set on this cell, unlike the text variant.
    var numericCell = new Cell { CellReference = cellReference };

    // Attach the sanitized value, then hang the cell on the row.
    var sanitizedValue = new CellValue { Text = ReplaceHexadecimalSymbols(cellStringValue) };
    numericCell.Append(sanitizedValue);
    excelRow.Append(numericCell);
}
Thanks. Let me know if you need further help.

- 155
- 1
- 5
- 16
What worked for me was checking the escaped special characters. The character that was failing was written as an escaped character reference (for example `&#x12;`), which starts with the same '&' that is also used in entities such as `&lt;`.
/// <summary>
/// Removes everything from <paramref name="s"/> that denotes a character XML 1.0 forbids:
/// raw characters outside the XML Char production, and numeric character references
/// (<c>&amp;#xH;</c> or <c>&amp;#N;</c>) whose code point is outside it.
/// </summary>
/// <remarks>
/// Valid references are copied through verbatim rather than decoded, so an escaped
/// <c>&amp;#x3C;</c> never turns into a raw '&lt;'. Text that merely looks like the start of a
/// reference (a lone '&amp;', "&amp;#" without digits or without ';', named entities) is kept as is.
/// </remarks>
/// <param name="s">The XML text to clean; may be null.</param>
/// <returns>The cleaned text, or null when <paramref name="s"/> is null.</returns>
public static string CleanInvalidEscapedXmlCharacters(string s)
{
    if (s == null)
    {
        return null;
    }

    StringBuilder output = new StringBuilder(s.Length);
    int index = 0;
    while (index < s.Length)
    {
        char ch = s[index];

        if (ch == '&')
        {
            int referenceLength;
            int codePoint;
            if (TryReadCharacterReference(s, index, out referenceLength, out codePoint))
            {
                // Keep a reference to a legal character exactly as written; drop it otherwise.
                if (IsValidXmlCodePoint(codePoint))
                {
                    output.Append(s, index, referenceLength);
                }
                index += referenceLength;
                continue;
            }
            // Not a well-formed numeric reference: treat '&' as an ordinary character below.
        }

        // A proper surrogate pair encodes a code point in U+10000-U+10FFFF, which XML allows.
        if (char.IsHighSurrogate(ch) && index + 1 < s.Length && char.IsLowSurrogate(s[index + 1]))
        {
            output.Append(ch).Append(s[index + 1]);
            index += 2;
            continue;
        }

        // Lone surrogates fall in U+D800-U+DFFF and are rejected by the range check.
        if (IsValidXmlCodePoint(ch))
        {
            output.Append(ch);
        }
        index++;
    }

    return output.ToString();
}

/// <summary>
/// Tries to read a numeric character reference ("&amp;#" digits ";" or "&amp;#x" hex-digits ";")
/// starting at <paramref name="start"/>; reports its total length and its code point.
/// </summary>
private static bool TryReadCharacterReference(string s, int start, out int length, out int codePoint)
{
    length = 0;
    codePoint = 0;

    if (start + 1 >= s.Length || s[start] != '&' || s[start + 1] != '#')
    {
        return false;
    }

    int pos = start + 2;
    // Only a lowercase 'x' introduces a hexadecimal reference.
    bool isHex = pos < s.Length && s[pos] == 'x';
    if (isHex)
    {
        pos++;
    }

    int digitsStart = pos;
    int radix = isHex ? 16 : 10;
    int value = 0;
    while (pos < s.Length)
    {
        int digit = DigitValue(s[pos], isHex);
        if (digit < 0)
        {
            break;
        }
        // Stop accumulating once the value is past the last Unicode code point: it stays
        // out of range (hence invalid) and arbitrarily long digit runs cannot overflow.
        if (value <= 0x10FFFF)
        {
            value = (value * radix) + digit;
        }
        pos++;
    }

    // A reference needs at least one digit and the terminating ';'.
    if (pos == digitsStart || pos >= s.Length || s[pos] != ';')
    {
        return false;
    }

    length = pos - start + 1;
    codePoint = value;
    return true;
}

/// <summary>Returns the numeric value of a decimal (or, if allowed, hexadecimal) digit, or -1.</summary>
private static int DigitValue(char c, bool allowHex)
{
    if (c >= '0' && c <= '9')
    {
        return c - '0';
    }
    if (allowHex)
    {
        if (c >= 'a' && c <= 'f')
        {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F')
        {
            return c - 'A' + 10;
        }
    }
    return -1;
}

/// <summary>Tells whether a code point matches the XML 1.0 Char production.</summary>
private static bool IsValidXmlCodePoint(int codePoint)
{
    return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD ||
           (codePoint >= 0x20 && codePoint <= 0xD7FF) ||
           (codePoint >= 0xE000 && codePoint <= 0xFFFD) ||
           (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
}

- 2,025
- 5
- 22
- 35