1

My program currently loops through a directory of pdf/image files and generates json files using the Azure computer vision REST API.

Besides generating the JSON files, I'd like to extract certain elements from the JSON generated, such as text and confidence and write them to a csv file, so that it would look somethin glike this:

e.g. desired csv file output:

filename     page    text                  words        confidence
file1.pdf    1       the quick brown fox                0.98
file1.pdf    2       the color is great!                0.71
file2.jpeg   1       something something                0.87 

However, this is a sample csv output of what im getting currently (goes all the way up to FJO column so impossible to paste all of the columns here):

output sample

How do i make the JsonToCsv() function return the filename, page, text, words, and confidence elements only to the csv file for each file?

here is some of an example JSON generated from one of the files:

{
  "status": "succeeded",
  "createdDateTime": "2020-05-28T05:13:21Z",
  "lastUpdatedDateTime": "2020-05-28T05:13:22Z",
  "analyzeResult": {
    "version": "3.1.0",
    "readResults": [
      {
        "page": 1,
        "language": "en",
        "angle": 0.8551,
        "width": 2661,
        "height": 1901,
        "unit": "pixel",
        "lines": [
          {
            "boundingBox": [
              67,
              646,
              2582,
              713,
              2580,
              876,
              67,
              821
            ],
            "text": "The quick brown fox jumps",
            "words": [
              {
                "boundingBox": [
                  143,
                  650,
                  435,
                  661,
                  436,
                  823,
                  144,
                  824
                ],
                "text": "The",
                "confidence": 0.958
              },
              {
                "boundingBox": [  

full code:

static void Main(string[] args)
{
    foreach (var file in Directory.GetFiles(inputFilesPath))
    {
        string inputFile = Path.GetFileName(file); // Return File Name + Extension

        // Call the REST API method.
        Console.WriteLine("\nExtracting text from: " + inputFile + "\n");
        ReadText(file, jsonFilesPath).Wait();
        
    }
    
    //Output to CSV
    foreach (var jsonFile in Directory.GetFiles(jsonFilesPath))
    {
        string csvFile = Path.GetFileName(csvFilePath); // Return File Name + Extension

        // Call the REST API method.
        Console.WriteLine("\nWriting/Parsing json to: " + csvFile + " file from: " + jsonFile + "\n");
        JsonToCsv(jsonFile, csvFilePath);
    }
}

public static void JsonToCsv(string jsonInputFile, string csvFile)
{
    // Tools > Nuget Package Manager > Package Manager Console
    // Install-Package ChoETL.JSON
    using (var r = new ChoJSONReader(jsonInputFile))
    {
        using (var w = new ChoCSVWriter(csvFile).WithFirstLineHeader())
        {
            w.Write(r);
        }
    }
}

static async Task ReadText(string inputFilePath, string outputFilePath)
{
    try
    {
        HttpClient client = new HttpClient();

        // Request headers.
        client.DefaultRequestHeaders.Add(
            "Ocp-Apim-Subscription-Key", subscriptionKey);

        string url = uriBase;

        HttpResponseMessage response;
        string operationLocation;

        // Reads the contents of the specified local input
        // into a byte array.
        byte[] byteData = GetInputAsByteArray(inputFilePath);

        // Adds the byte array as an octet stream to the request body.
        using (ByteArrayContent content = new ByteArrayContent(byteData))
        {
            content.Headers.ContentType =
                new MediaTypeHeaderValue("application/octet-stream");

            response = await client.PostAsync(url, content);
        }

        if (response.IsSuccessStatusCode)
            operationLocation =
                response.Headers.GetValues("Operation-Location").FirstOrDefault();
        else
        {
            // Display the JSON error data.
            string errorString = await response.Content.ReadAsStringAsync();
            Console.WriteLine("\n\nResponse:\n{0}\n",
                JToken.Parse(errorString).ToString());
            return;
        }

        string contentString;
        int i = 0;
        do
        {
            System.Threading.Thread.Sleep(1000);
            response = await client.GetAsync(operationLocation);
            contentString = await response.Content.ReadAsStringAsync();
            ++i;
        }
        while (i < 60 && contentString.IndexOf("\"status\":\"succeeded\"") == -1);

        if (i == 60 && contentString.IndexOf("\"status\":\"succeeded\"") == -1)
        {
            Console.WriteLine("\nTimeout error.\n");
            return;
        }

        // Display the JSON response.
        Console.WriteLine("\nResponse:\n\n{0}\n",
            JToken.Parse(contentString).ToString());

        // Write JSON response to file
        string inputFileName = Path.GetFileNameWithoutExtension(inputFilePath); // Extract File Name only
        var responseData = JToken.Parse(contentString).ToString(); //Fetch Data and assign it for file output
        string jsonOutputFile = Path.Combine(outputFilePath, inputFileName + ".json");
        File.WriteAllText(jsonOutputFile, responseData);
    }
    catch (Exception e)
    {
        Console.WriteLine("\n" + e.Message);
    }
}
Cinchoo
  • 6,088
  • 2
  • 19
  • 34
Cataster
  • 3,081
  • 5
  • 32
  • 79
  • i did this same task with the Google Vision API. basically you will want just the "words" node of the json result. see this for a start. https://bitbucket.org/MuseumOfModernArt/vision-api/src/master/ – smoore4 Oct 19 '20 at 18:42
  • @smoore4 interesting! the UI idea is nice actually. this gives me an idea/question: could you generate an object (e.g. image) from the JSON file to compare it visually that way if it actually extracted the text precisely? in other words, "reverse-generate" an object – Cataster Oct 19 '20 at 18:48
  • hmm...in theory yes. let's say it is a horse. you would need type, color, age, position (front/back, sides, etc). i'm sure google could then find a corresponding image that fits those criteria. if that's what you mean. i dont believe you could create a new image tho – smoore4 Oct 19 '20 at 18:54
  • @smoore4 i see. thats probably beyond my capability hahaha. I thought your AFTER images were doing that pretty much but it may be just displaying the info from the database for each image when you hover on the image. – Cataster Oct 19 '20 at 18:55

1 Answers1

1

Guess this may point you to right direction to generate expected CSV from JSON

StringBuilder csv = new StringBuilder();
using (var p = new ChoJSONReader("*** YOUR JSON FILE ***")
    .WithJSONPath("$..readResults")
    )
{
    using (var w = new ChoCSVWriter(csv)
        .WithFirstLineHeader()
        )
    {
        w.Write(p
            .SelectMany(r1 => ((dynamic[])r1.lines).SelectMany(r2 => ((dynamic[])r2.words).Select(r3 => new
            {
                r1.page,
                r2.text,
                words = r3.text,
                r3.confidence
            }))));
    }
}

Console.WriteLine(csv.ToString());

Output:

page,text,words,confidence
1,The quick brown fox jumps,The,0.958
1,The quick brown fox jumps,quick,0.57
1,The quick brown fox jumps,brown,0.799
1,The quick brown fox jumps,fox,0.442
1,The quick brown fox jumps,jumps,0.878
1,over,over,0.37
1,the lazy dog!,the,0.909
1,the lazy dog!,lazy,0.853
1,the lazy dog!,dog!,0.41

UPDATE:

Use: ChoETL.JSON v1.2.1.2

To generate CSV with spaced field names, use WithField to specify the field header for each field.

StringBuilder csv = new StringBuilder();
using (var p = new ChoJSONReader("sample43.json")
    .WithJSONPath("$..readResults")
    )
{
    using (var w = new ChoCSVWriter(csv)
        .WithField("FileName", fieldName: "File Name")
        .WithField("page")
        .WithField("text")
        .WithField("words")
        .WithField("confidence")
        .WithFirstLineHeader()
        )
    {
        w.Write(p
            .SelectMany(r1 => ((dynamic[])r1.lines).SelectMany(r2 => ((dynamic[])r2.words).Select(r3 => new
            {
                FileName = "file.json",
                r1.page,
                r2.text,
                words = r3.text,
                r3.confidence
            }))));
    }
}

Console.WriteLine(csv.ToString());

Output:

File Name,page,text,words,confidence
file.json,1,The quick brown fox jumps,The,0.958
file.json,1,The quick brown fox jumps,quick,0.57
file.json,1,The quick brown fox jumps,brown,0.799
file.json,1,The quick brown fox jumps,fox,0.442
file.json,1,The quick brown fox jumps,jumps,0.878
file.json,1,over,over,0.37
file.json,1,the lazy dog!,the,0.909
file.json,1,the lazy dog!,lazy,0.853
file.json,1,the lazy dog!,dog!,0.41
Cinchoo
  • 6,088
  • 2
  • 19
  • 34
  • Interesting! if i want to include the filename, would i do it like this? `string inputFile = Path.GetFileName(jsonInputFile);` `...w.Write("File Name" = inputFile, p.SelectMany....` – Cataster Oct 21 '20 at 20:00
  • where is the `filename` in the JSON? I can't seem to find it, thats why I didn't include it. – Cinchoo Oct 23 '20 at 12:20
  • oh it's not part of the json. Just a separate variable. It's basically a nice additional column to specify where this extraction is coming from, since I plan to store multiple files information. – Cataster Oct 23 '20 at 13:54
  • well, in that case, you just add it in anonymous type in the `select` – Cinchoo Oct 23 '20 at 14:24
  • yep, for some reason Visual Studio was showing error when I was doing that so i thought the Select statement is only reserved to whatever is in the dynamic[]. It may have been a visual glitch. btw, a column name like `FileName = "file.json"` works, `Words`, etc... but if i want space between the words, for example, `"File Name" = "file.json"` Visual studio complains `Invalid anonymous type member declarator`. why doesnt that naming work inside the Select? – Cataster Oct 23 '20 at 14:53
  • that is expected behavior, you can't have space in them. – Cinchoo Oct 23 '20 at 15:13
  • Oh i see. is there a way to circumvent this? is it just a limitation with ChoCSVWriter? – Cataster Oct 23 '20 at 15:28
  • 1
    Thank you very much for also specifying the `ChoETL.JSON v1.2.1.2` version! I had to uninstall the current old version I had and when i reinstalled `Install-Package ChoETL.JSON -Version 1.2.1.2` it worked :) – Cataster Oct 23 '20 at 19:00