C # valeur hexadécimale 0x12, est un caractère invalide

je charge beaucoup de documents xml et certains d'entre eux renvoient des erreurs comme" valeur hexadécimale 0x12, est un caractère invalide " et il y a des caractères différents. Comment les supprimer?

21
demandé sur Duke 2014-01-10 23:46:29

6 réponses

j'ai fait une petite recherche ici.

Voici la table ASCII. Il y a 128 symboles asciitable Voici un petit code de test qui ajoute chaque symbole de la table ASCII et tente de le charger comme un document XML.

static public void RegexTry()
{
    StreamReader stream = new StreamReader(@"test.xml");
    string xmlfile = stream.ReadToEnd();
    stream.Close();

    string text = "";

    for (int i = 0; i < 128; i++ )
    {
        char t = (char) i;

        text = xmlfile.Replace('П', t);

        XmlDocument xml = new XmlDocument();
        try
        {
            xml.LoadXml(text);
        }
        catch (Exception ex)
        {
            Console.WriteLine("Char("+i.ToString() +"): " + t + " => error! " + ex.Message);
            continue;
        }

        Console.WriteLine("Char(" + i.ToString() + "): " + t + " => fine!");
    }

    Console.ReadKey();
}

en conséquence il retourne:

Char(0): => error! '.', hexadecimal value 0x00, is an invalid character. Line 5, position 7.
Char(1): => error! '', hexadecimal value 0x01, is an invalid character. Line 5, position 7.
Char(2): => error! '', hexadecimal value 0x02, is an invalid character. Line 5, position 7.
Char(3): => error! '', hexadecimal value 0x03, is an invalid character. Line 5, position 7.
Char(4): => error! '', hexadecimal value 0x04, is an invalid character. Line 5, position 7.
Char(5): => error! '', hexadecimal value 0x05, is an invalid character. Line 5, position 7.
Char(6): => error! '', hexadecimal value 0x06, is an invalid character. Line 5, position 7.
Char(7): => error! '', hexadecimal value 0x07, is an invalid character. Line 5, position 7.
Char(8): => error! '', hexadecimal value 0x08, is an invalid character. Line 5, position 7.
Char(9):     => fine!
Char(10): 
 => fine!
Char(11): => error! '', hexadecimal value 0x0B, is an invalid character. Line 5, position 7.
Char(12): => error! '', hexadecimal value 0x0C, is an invalid character. Line 5, position 7.
Char(13): 
 => fine!
Char(14): => error! '', hexadecimal value 0x0E, is an invalid character. Line 5, position 7.
Char(15): => error! '', hexadecimal value 0x0F, is an invalid character. Line 5, position 7.
Char(16): => error! '', hexadecimal value 0x10, is an invalid character. Line 5, position 7.
Char(17): => error! '', hexadecimal value 0x11, is an invalid character. Line 5, position 7.
Char(18): => error! '', hexadecimal value 0x12, is an invalid character. Line 5, position 7.
Char(19): => error! '', hexadecimal value 0x13, is an invalid character. Line 5, position 7.
Char(20): => error! '', hexadecimal value 0x14, is an invalid character. Line 5, position 7.
Char(21): => error! '', hexadecimal value 0x15, is an invalid character. Line 5, position 7.
Char(22): => error! '', hexadecimal value 0x16, is an invalid character. Line 5, position 7.
Char(23): => error! '', hexadecimal value 0x17, is an invalid character. Line 5, position 7.
Char(24): => error! '', hexadecimal value 0x18, is an invalid character. Line 5, position 7.
Char(25): => error! '', hexadecimal value 0x19, is an invalid character. Line 5, position 7.
Char(26): => error! '', hexadecimal value 0x1A, is an invalid character. Line 5, position 7.
Char(27): => error! '', hexadecimal value 0x1B, is an invalid character. Line 5, position 7.
Char(28): => error! '', hexadecimal value 0x1C, is an invalid character. Line 5, position 7.
Char(29): => error! '', hexadecimal value 0x1D, is an invalid character. Line 5, position 7.
Char(30): => error! '', hexadecimal value 0x1E, is an invalid character. Line 5, position 7.
Char(31): => error! '', hexadecimal value 0x1F, is an invalid character. Line 5, position 7.
Char(32):   => fine!
Char(33): ! => fine!
Char(34): " => fine!
Char(35): # => fine!
Char(36): $ => fine!
Char(37): % => fine!
Char(38): => error! An error occurred while parsing EntityName. Line 5, position 8.
Char(39): ' => fine!
Char(40): ( => fine!
Char(41): ) => fine!
Char(42): * => fine!
Char(43): + => fine!
Char(44): , => fine!
Char(45): - => fine!
Char(46): . => fine!
Char(47): / => fine!
Char(48): 0 => fine!
Char(49): 1 => fine!
Char(50): 2 => fine!
Char(51): 3 => fine!
Char(52): 4 => fine!
Char(53): 5 => fine!
Char(54): 6 => fine!
Char(55): 7 => fine!
Char(56): 8 => fine!
Char(57): 9 => fine!
Char(58): : => fine!
Char(59): ; => fine!
Char(60): => error! The '<' character, hexadecimal value 0x3C, cannot be included in a name. Line 5, position 13.
Char(61): = => fine!
Char(62): > => fine!
Char(63): ? => fine!
Char(64): @ => fine!
Char(65): A => fine!
Char(66): B => fine!
Char(67): C => fine!
Char(68): D => fine!
Char(69): E => fine!
Char(70): F => fine!
Char(71): G => fine!
Char(72): H => fine!
Char(73): I => fine!
Char(74): J => fine!
Char(75): K => fine!
Char(76): L => fine!
Char(77): M => fine!
Char(78): N => fine!
Char(79): O => fine!
Char(80): P => fine!
Char(81): Q => fine!
Char(82): R => fine!
Char(83): S => fine!
Char(84): T => fine!
Char(85): U => fine!
Char(86): V => fine!
Char(87): W => fine!
Char(88): X => fine!
Char(89): Y => fine!
Char(90): Z => fine!
Char(91): [ => fine!
Char(92): \ => fine!
Char(93): ] => fine!
Char(94): ^ => fine!
Char(95): _ => fine!
Char(96): ` => fine!
Char(97): a => fine!
Char(98): b => fine!
Char(99): c => fine!
Char(100): d => fine!
Char(101): e => fine!
Char(102): f => fine!
Char(103): g => fine!
Char(104): h => fine!
Char(105): i => fine!
Char(106): j => fine!
Char(107): k => fine!
Char(108): l => fine!
Char(109): m => fine!
Char(110): n => fine!
Char(111): o => fine!
Char(112): p => fine!
Char(113): q => fine!
Char(114): r => fine!
Char(115): s => fine!
Char(116): t => fine!
Char(117): u => fine!
Char(118): v => fine!
Char(119): w => fine!
Char(120): x => fine!
Char(121): y => fine!
Char(122): z => fine!
Char(123): { => fine!
Char(124): | => fine!
Char(125): } => fine!
Char(126): ~ => fine!
Char(127):  => fine!  

vous pouvez voir qu'il y a beaucoup de symboles qui ne peuvent pas être en code XML. Pour les remplacer, Nous pouvons utiliser Reqex.Remplacer

static string ReplaceHexadecimalSymbols(string txt)
{
    string r = "[\x00-\x08\x0B\x0C\x0E-\x1F\x26]";
    return Regex.Replace(txt, r,"",RegexOptions.Compiled);
}

PS. Désolé si tout le monde le savait.

47
répondu Duke 2016-05-20 07:16:54

le spécification XML définit les caractères valides comme ceci:

Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]

comme vous pouvez le voir, #x12 n'est pas un caractère valide dans un document XML.

demandez-Vous comment les supprimer mais je pense que ce n'est pas la question que vous devriez poser. Ils devraient tout simplement pas être présent. Vous devez rejeter un tel document mal formé. Simplement supprimer des caractères invalides supprime le vrai problème.

si vous créez les documents en question, vous devez corriger le code qui le génère afin qu'il génère du XML valide.

12
répondu David Heffernan 2014-01-10 19:57:40

je pense que x26 "&" est un caractère valide et il pourrait être de sérialisé en XML.

donc pour remplacer le caractère illégal, nous devrions utiliser:

// Replace illegal character in XML documents with blank
// See here for reference http://www.w3.org/TR/xml/#charsets
var regex = "[\x00-\x08\x0B\x0C\x0E-\x1F]";
xml = Regex.Replace(xml, r, String.Empty, RegexOptions.Compiled);
6
répondu Xudong Fei 2015-01-14 08:41:34

il s'agit essentiellement d'un cas spécial de cette question . Je suggère que vous utilisiez une des réponses de là-bas.

0
répondu Brian Reischl 2017-05-23 10:31:34

il suffit de mettre à jour ces fonctions avec la correction mentionnée fournie par jhon et où vous devez mettre à jour vérifier ces fonctions dans votre code. il va travailler pour vous j'ai testé.

  private static void WriteDataTableToExcelWorksheet(DataTable dt, WorksheetPart worksheetPart)
    {
        var worksheet = worksheetPart.Worksheet;
        var sheetData = worksheet.GetFirstChild<SheetData>();

        string cellValue = "";

        //  Create a Header Row in our Excel file, containing one header for each Column of data in our DataTable.
        //
        //  We'll also create an array, showing which type each column of data is (Text or Numeric), so when we come to write the actual
        //  cells of data, we'll know if to write Text values or Numeric cell values.
        int numberOfColumns = dt.Columns.Count;
        bool[] IsNumericColumn = new bool[numberOfColumns];

        string[] excelColumnNames = new string[numberOfColumns];
        for (int n = 0; n < numberOfColumns; n++)
            excelColumnNames[n] = GetExcelColumnName(n);

        //
        //  Create the Header row in our Excel Worksheet
        //
        uint rowIndex = 1;

        var headerRow = new Row { RowIndex = rowIndex };  // add a row at the top of spreadsheet
        sheetData.Append(headerRow);

        for (int colInx = 0; colInx < numberOfColumns; colInx++)
        {
            DataColumn col = dt.Columns[colInx];
            AppendTextCell(excelColumnNames[colInx] + "1", col.ColumnName, headerRow);
            IsNumericColumn[colInx] = (col.DataType.FullName == "System.Decimal") || (col.DataType.FullName == "System.Int32");
        }

        //
        //  Now, step through each row of data in our DataTable...
        //
        double cellNumericValue = 0;
        foreach (DataRow dr in dt.Rows)
        {
            // ...create a new row, and append a set of this row's data to it.
            ++rowIndex;
            var newExcelRow = new Row { RowIndex = rowIndex };  // add a row at the top of spreadsheet
            sheetData.Append(newExcelRow);

            for (int colInx = 0; colInx < numberOfColumns; colInx++)
            {
                cellValue = dr.ItemArray[colInx].ToString();

                // Create cell with data
                if (IsNumericColumn[colInx])
                {
                    //  For numeric cells, make sure our input data IS a number, then write it out to the Excel file.
                    //  If this numeric value is NULL, then don't write anything to the Excel file.
                    cellNumericValue = 0;
                    if (double.TryParse(cellValue, out cellNumericValue))
                    {
                        cellValue = ReplaceHexadecimalSymbols(cellNumericValue.ToString());
                        AppendNumericCell(excelColumnNames[colInx] + rowIndex.ToString(), cellValue, newExcelRow);
                    }
                }
                else
                {
                    //  For text cells, just write the input data straight out to the Excel file.
                    AppendTextCell(excelColumnNames[colInx] + rowIndex.ToString(), cellValue, newExcelRow);
                }
            }
        }
    }
    static string ReplaceHexadecimalSymbols(string txt)
    {
        string r = "[\x00-\x08\x0B\x0C\x0E-\x1F\x26]";
        return Regex.Replace(txt, r, "", RegexOptions.Compiled);
    }

    private static void AppendTextCell(string cellReference, string cellStringValue, Row excelRow)
    {
        //  Add a new Excel Cell to our Row 
        Cell cell = new Cell() { CellReference = cellReference, DataType = CellValues.String };
        CellValue cellValue = new CellValue();
        cellValue.Text = ReplaceHexadecimalSymbols(cellStringValue);
        cell.Append(cellValue);
        excelRow.Append(cell);
    }

    private static void AppendNumericCell(string cellReference, string cellStringValue, Row excelRow)
    {
        //  Add a new Excel Cell to our Row 
        Cell cell = new Cell() { CellReference = cellReference };
        CellValue cellValue = new CellValue();
        cellValue.Text = ReplaceHexadecimalSymbols(cellStringValue);
        cell.Append(cellValue);
        excelRow.Append(cell);
    }

Merci faites-moi savoir si vous avez besoin d'aide.

0
répondu Noor Rahman 2014-10-31 11:44:12

la solution Regex fonctionne assez rapidement, même sur un document XML de 100 Mo.

la chaîne d'expression suivante ferait le travail.

"[\x00-\x08\x0B\x0C\x0E-\x1F]"
0
répondu LysanderM 2015-06-10 11:39:23