-1

I have a very large file that contains the following line multiple times in different places. The file has no real organisation, but this line is repeated throughout:

 <mcwb:Control ClassId="jdn8732d" ControlDisplayName="Type" ControlId="asp_type" DataFieldName="asp_type" IsSystemControl="False" IsUnbound="False" SystemStepType="0">

I need to extract the following:

1. ControlDisplayName="Type"
2. ControlId="asp_type" 
3. DataFieldName="asp_type"

I'm struggling to parse this file with C#. Does anyone know how I can proceed?

sy-huss
  • 183
  • 2
  • 14
  • 1
    It looks like plain old XML to me, far from erratic. – Jeroen Mostert Mar 26 '21 at 10:30
  • Even in the absence of XML parsing (if for whatever reason the rest of the file isn't valid XML), this line looks like it could just be split by the space character. What's "erratic" about this data? What have you tried and what isn't working as expected? – David Mar 26 '21 at 10:38
  • post a full structure of the file, as mentioned this looks like XML https://stackoverflow.com/help/minimal-reproducible-example – kshkarin Mar 26 '21 at 10:39
  • The file has XML content, however it is the output of a processing job that has taken place. The file also contains JSON, so not a valid XML file. I was trying to modify this: https://stackoverflow.com/questions/13024073/regex-c-sharp-extract-text-within-double-quotes, however, I can't get the regex statement right – sy-huss Mar 26 '21 at 10:40

1 Answer

0

Here is a very simple implementation to get you started, for the case where you really cannot use an XML library to search with native XML functionality. (There are better, more efficient and more robust ways of doing this task.) Remember to add exception handling, special cases and additional validation where appropriate.

// usage: read the whole test file into memory and walk every match the searcher reports
private static void Main(string[] args) {

    Searcher searcher = new Searcher();
    string contents = System.IO.File.ReadAllText(@"c:\temp\testfile.txt");

    foreach (SearchExtract extract in searcher.GetOccurances(contents)) {
        // do something with extract
    }
}

/// <summary>
/// One matched control tag: where it starts in the searched text and the
/// three attribute values taken from it.
/// </summary>
public class SearchExtract {

    /// <summary>
    /// Offset of the match within the searched text. Makes the match easier to
    /// locate in the document for checking, and can be used to calculate lines.
    /// </summary>
    public int StartIndex { get; set; }

    /// <summary>Value of the <c>ControlDisplayName</c> attribute.</summary>
    public string ControlDisplayName { get; set; }

    /// <summary>Value of the <c>ControlId</c> attribute.</summary>
    public string ControlId { get; set; }

    /// <summary>Value of the <c>DataFieldName</c> attribute.</summary>
    public string DataFieldName { get; set; }
}

/// <summary>
/// Scans free-form text for <c>&lt;mcwb:Control ...&gt;</c> start tags and extracts the
/// <c>ControlDisplayName</c>, <c>ControlId</c> and <c>DataFieldName</c> attribute values
/// from each one. The text does not have to be a valid XML document.
/// </summary>
/// <remarks>
/// Assumes the tag terminator (<c>&gt;</c>) never appears inside an attribute value of
/// the tag; a tag containing such a value is cut short and will usually be skipped.
/// </remarks>
public class Searcher {

    private const string Start = "<mcwb:Control "; // with the space
    private const string End = ">";

    /// <summary>
    /// Lazily yields one <see cref="SearchExtract"/> per control tag that carries all
    /// three attributes with double-quoted values.
    /// </summary>
    /// <param name="input">
    /// The full text to scan. A null or empty string yields no results.
    /// For very large files consider feeding the text from a buffered stream reader instead.
    /// </param>
    /// <returns>
    /// The matches in document order. Tags missing any of the three attributes are skipped.
    /// </returns>
    public System.Collections.Generic.IEnumerable<SearchExtract> GetOccurances(string input) {

        if (string.IsNullOrEmpty(input)) {
            yield break;
        }

        int startIndex = input.IndexOf(Start, System.StringComparison.Ordinal);
        while (startIndex >= 0) {

            int attributesStart = startIndex + Start.Length;
            int endIndex = input.IndexOf(End, attributesStart, System.StringComparison.Ordinal);
            if (endIndex < 0) {
                // Unterminated tag: there is no terminator anywhere after this point, so no
                // later tag can be complete either. Stop instead of restarting from the top.
                yield break;
            }

            // Every lookup is confined to [attributesStart, endIndex) so a missing attribute
            // can never pick up a value that belongs to a later tag or to unrelated text.
            string controlDisplayNameValue = FindAttributeValue(input, nameof(SearchExtract.ControlDisplayName), attributesStart, endIndex);
            string controlIdValue = FindAttributeValue(input, nameof(SearchExtract.ControlId), attributesStart, endIndex);
            string dataFieldNameValue = FindAttributeValue(input, nameof(SearchExtract.DataFieldName), attributesStart, endIndex);

            // all three are required for a valid extract
            if (controlDisplayNameValue != null && controlIdValue != null && dataFieldNameValue != null) {
                yield return new SearchExtract() {
                    StartIndex = startIndex,
                    ControlDisplayName = controlDisplayNameValue,
                    ControlId = controlIdValue,
                    DataFieldName = dataFieldNameValue
                };
            }

            startIndex = input.IndexOf(Start, endIndex + 1, System.StringComparison.Ordinal);
        }
    }

    /// <summary>
    /// Finds the attribute called <paramref name="name"/> inside the tag body
    /// <c>[from, to)</c> and returns its double-quoted value.
    /// </summary>
    /// <returns>The value without its quotes (possibly empty), or null when the attribute is absent or malformed.</returns>
    private static string FindAttributeValue(string input, string name, int from, int to) {

        int searchFrom = from;
        while (searchFrom < to) {

            int nameIndex = input.IndexOf(name, searchFrom, to - searchFrom, System.StringComparison.Ordinal);
            if (nameIndex < 0) {
                return null;
            }

            int afterName = nameIndex + name.Length;

            // Accept whole attribute names only, so that e.g. "ParentControlId" is not
            // mistaken for "ControlId".
            bool isWholeName = nameIndex == from || char.IsWhiteSpace(input[nameIndex - 1]);
            if (isWholeName) {
                string value = FindValue(afterName, input, to);
                if (value != null) {
                    return value;
                }
            }

            searchFrom = afterName;
        }

        return null;
    }

    /// <summary>
    /// Reads <c>="value"</c> starting at <paramref name="index"/> (whitespace around the
    /// equals sign is allowed) without looking at or beyond <paramref name="limit"/>.
    /// </summary>
    /// <returns>The text between the quotes, or null when the expected shape is not found before the limit.</returns>
    private static string FindValue(int index, string input, int limit) {

        const char ValueDelimiter = '"';

        int cursor = SkipWhiteSpace(input, index, limit);
        if (cursor >= limit || input[cursor] != '=') {
            return null;
        }

        cursor = SkipWhiteSpace(input, cursor + 1, limit);
        if (cursor >= limit || input[cursor] != ValueDelimiter) {
            return null;
        }

        int valueStart = cursor + 1;
        int valueEnd = input.IndexOf(ValueDelimiter, valueStart, limit - valueStart);
        if (valueEnd < 0) {
            return null;
        }

        return input.Substring(valueStart, valueEnd - valueStart);
    }

    /// <summary>
    /// Returns the first position at or after <paramref name="index"/> that is not
    /// whitespace, or <paramref name="limit"/> if there is none before it.
    /// </summary>
    private static int SkipWhiteSpace(string input, int index, int limit) {

        while (index < limit && char.IsWhiteSpace(input[index])) {
            index++;
        }

        return index;
    }
}
MaLio
  • 2,498
  • 16
  • 23