0

I am using the Amazon Textract .NET SDK to extract text from images. It returns a list of blocks as part of the response. I need to extract key-value pairs from the extracted text. I guess we need to iterate through the list of blocks and check for the KEY_VALUE_SET block type.

Is my understanding right? Can someone share a piece of code that produces the key-value pairs after text extraction?

My sample code:

    // Build an AnalyzeDocument request that asks for both the FORMS and TABLES feature types.
    var DocRequest = new AnalyzeDocumentRequest()
    {
       Document = MyDocument,
       FeatureTypes = new List<string> { Amazon.Textract.FeatureType.FORMS, Amazon.Textract.FeatureType.TABLES }
    };
    // NOTE(review): the async call is not awaited here, so `response` is presumably a pending
    // task rather than the finished result — confirm and await it before reading the blocks.
    var response = client.AnalyzeDocumentAsync(DocRequest);
halfer
  • 19,824
  • 17
  • 99
  • 186

4 Answers4

1

The code below populates the maps. The AnalyzeResult class is just a holder for the maps of key and value blocks and for the list of pages.

       // Index every block of this result by id and bucket the KEY_VALUE_SET blocks
       // into key blocks and value blocks.
       List<Block> blocks = result.getBlocks();
        for (Block block : blocks) {
            String blockId = block.getId();
            // Every block is registered by id so that relationship ids can be resolved later.
            analyzeResult.blockMap.put(blockId, block);
            String blockType = block.getBlockType();
            switch(blockType){
                case AppConstant.BLOCK_PAGE :
                    // A PAGE block starts a new, initially empty list of text lines.
                    page = new ArrayList<TextLine>();
                    analyzeResult.pages.add(page);
                    break;
                case AppConstant.BLOCK_KEY_VALUE_SET :
                    // Blocks whose entity types contain the KEY marker go to keyMap;
                    // every other KEY_VALUE_SET block is stored as a value.
                    if (block.getEntityTypes().contains(AppConstant.BLOCK_KEY)) {
                        analyzeResult.keyMap.put(blockId, block);
                    }
                    else {
                        analyzeResult.valueMap.put(blockId, block);
                    }
                    break;
                
            }
        }


        // A null next-token sets the finished flag.
        // NOTE(review): the closing brace below belongs to a construct opened outside this
        // excerpt — presumably the pagination loop; confirm against the full method.
        paginationToken = result.getNextToken();
        if (paginationToken == null) {
            finished = true;
        }
    }

The functions below can be used to resolve the relationships and find the key-value pairs:

/**
 * Pairs every key block with its value block and returns the resulting form fields.
 *
 * <p>Key blocks for which no value block is found are skipped. Key and value text are
 * trimmed (a {@code null} text becomes the empty string). Each field also carries the key
 * block's page, the top of its bounding box and its confidence.
 *
 * @param analyzeResult holder of the key, value and block maps
 * @return the form fields, sorted with {@link Collections#sort(List)}
 */
public List getKVRelationShip(AnalyzeResult analyzeResult) {
    final Set<Map.Entry<String, Block>> keyEntries = analyzeResult.keyMap.entrySet();
    List formFields = new ArrayList<FormInfo>();

    for (Map.Entry<String, Block> keyEntry : keyEntries) {
        Block keyBlock = keyEntry.getValue();
        Block valueBlock = this.findValueBlock(keyBlock, analyzeResult.valueMap);
        if (valueBlock == null) {
            // A key without a value block produces no form field.
            continue;
        }

        String keyText = getText(keyBlock, analyzeResult.blockMap);
        String valueText = getText(valueBlock, analyzeResult.blockMap);
        keyText = (keyText == null) ? "" : keyText.trim();
        valueText = (valueText == null) ? "" : valueText.trim();

        formFields.add(new FormInfo(keyText, valueText, keyBlock.getPage(),
                keyBlock.getGeometry().getBoundingBox().getTop(), keyBlock.getConfidence()));
    }

    Collections.sort(formFields);
    return formFields;
}

/**
 * Concatenates the text of all CHILD blocks referenced by the given block.
 *
 * <p>WORD children contribute their text followed by a single space. SELECTION_ELEMENT
 * children contribute {@code "X "} when their status is SELECTED and nothing otherwise.
 * The marker carries a trailing space so it does not fuse with a following word.
 * Child ids that cannot be resolved in {@code blockMap} are skipped.
 *
 * @param results  block whose children should be rendered; may be {@code null}
 * @param blockMap map from block id to {@code Block}
 * @return the concatenated text, possibly with a trailing space; never {@code null}
 */
public String getText(Block results, Map blockMap) {
    if (results == null || results.getRelationships() == null) {
        return "";
    }

    StringBuilder text = new StringBuilder();
    for (Relationship relationship : results.getRelationships()) {
        // Constant-first equals keeps a null type or a null id list from throwing.
        if (!AppConstant.BLOCK_CHILD.equals(relationship.getType()) || relationship.getIds() == null) {
            continue;
        }
        for (String childId : relationship.getIds()) {
            Block child = (Block) blockMap.get(childId);
            if (child == null) {
                // The id is not in the map; skip it.
                continue;
            }
            if (AppConstant.BLOCK_WORD.equals(child.getBlockType())) {
                text.append(child.getText()).append(' ');
            } else if (AppConstant.BLOCK_SELECTION_ELEMENT.equals(child.getBlockType())
                    && AppConstant.BLOCK_SELECTED.equals(child.getSelectionStatus())) {
                text.append("X ");
            }
        }
    }
    return text.toString();
}

/**
 * Finds the value block that a key block points to through its VALUE relationships.
 *
 * <p>When several ids resolve, the last resolvable one wins. Ids that are not present in
 * {@code valueMap} are ignored, so they cannot overwrite an already found value block.
 *
 * @param block    key block to inspect; may be {@code null}
 * @param valueMap map from block id to value {@code Block}
 * @return the matching value block, or {@code null} when none can be resolved
 */
private Block findValueBlock(Block block, Map valueMap) {
    if (block == null || block.getRelationships() == null) {
        // Same null guard as getText: there are no relationships to follow.
        return null;
    }

    Block valueBlock = null;
    for (Relationship relationship : block.getRelationships()) {
        if (!AppConstant.BLOCK_VALUE.equals(relationship.getType()) || relationship.getIds() == null) {
            continue;
        }
        for (String valueId : relationship.getIds()) {
            Block candidate = (Block) valueMap.get(valueId);
            if (candidate != null) {
                valueBlock = candidate;
            }
        }
    }
    return valueBlock;
}
arvind
  • 106
  • 7
1

If you need a JavaScript version, I wrote the following.

/**
 * Collects the values of every enumerable property of `obj` into an array,
 * in `for...in` enumeration order.
 *
 * @param {Object} obj object whose property values are collected
 * @returns {Array} the collected values
 */
function items(obj) {
  const values = [];
  for (const propertyName in obj) {
    values.push(obj[propertyName]);
  }
  return values;
}

/**
 * Looks up the value block referenced by a key block's "VALUE" relationships.
 * When several ids are listed, the lookup result of the last one is returned.
 *
 * @param {Object} keyBlock key block with an optional `Relationships` array
 * @param {Object} valueMap value blocks indexed by block id
 * @returns {Object|undefined} the value block, or undefined when none is referenced
 */
function findValueBlock(keyBlock, valueMap) {
  let found;
  for (const relationship of keyBlock.Relationships || []) {
    if (relationship.Type !== "VALUE") {
      continue;
    }
    for (const valueId of relationship.Ids || []) {
      found = valueMap[valueId];
    }
  }
  return found;
}

/**
 * Concatenates the text of all "CHILD" blocks referenced by `result`.
 *
 * WORD children contribute their `Text` followed by a space. SELECTION_ELEMENT
 * children contribute "X " when their `SelectionStatus` is "SELECTED".
 * A missing `result`, a missing `Relationships` list and child ids that are not
 * present in `blockMap` are tolerated and contribute nothing.
 *
 * @param {Object|undefined} result block whose children should be rendered
 * @param {Object} blockMap all blocks indexed by block id
 * @returns {string} the concatenated text (with a trailing space when non-empty)
 */
function getText(result, blockMap) {
  let text = "";
  if (!result || !result.Relationships) {
    return text;
  }
  for (const relationship of result.Relationships) {
    if (relationship.Type !== "CHILD") {
      continue;
    }
    for (const childId of relationship.Ids || []) {
      const child = blockMap[childId];
      if (!child) {
        // Unknown id: skip it instead of throwing on a property access.
        continue;
      }
      if (child.BlockType === "WORD") {
        text += child.Text + " ";
      } else if (child.BlockType === "SELECTION_ELEMENT" && child.SelectionStatus === "SELECTED") {
        text += "X ";
      }
    }
  }
  return text;
}

/**
 * Builds a key-text to value-text mapping from the key blocks.
 *
 * The result is a plain object, so it serialises and enumerates like a map.
 * A key without a resolvable value block is mapped to the empty string.
 *
 * @param {Object} keyMap key blocks indexed by block id
 * @param {Object} valueMap value blocks indexed by block id
 * @param {Object} blockMap all blocks indexed by block id
 * @returns {Object} mapping from key text to value text
 */
function getRelationships(keyMap, valueMap, blockMap) {
  const kvs = {};
  for (const keyBlock of items(keyMap)) {
    const valueBlock = findValueBlock(keyBlock, valueMap);
    // Declared locally so the function no longer leaks `key` and `val` as globals.
    const key = getText(keyBlock, blockMap);
    const val = valueBlock ? getText(valueBlock, blockMap) : "";
    kvs[key] = val;
  }
  return kvs;
}

/**
 * Extracts the key-value pairs from a Textract AnalyzeDocument response.
 *
 * Every block is indexed by id. KEY_VALUE_SET blocks are split into key blocks
 * (their `EntityTypes` contain "KEY") and value blocks (all others). The three
 * maps are handed to `getRelationships`, whose result is logged and returned.
 *
 * @param {Object} response Textract response with a `Blocks` array
 * @returns {*} whatever `getRelationships` produced for the collected maps
 */
function extractKeyValue(response) {
  const key_map = {};
  const value_map = {};
  const block_map = {};

  const blocks = (response && response.Blocks) || [];
  blocks.forEach((block) => {
    const block_id = block["Id"];
    block_map[block_id] = block;
    if (block["BlockType"] === "KEY_VALUE_SET") {
      // Look at every entity type instead of only the first entry.
      if ((block["EntityTypes"] || []).includes("KEY")) {
        key_map[block_id] = block;
      } else {
        value_map[block_id] = block;
      }
    }
  });

  const kvs = getRelationships(key_map, value_map, block_map);
  console.log("kvs", kvs);
  // Returned as well as logged so callers can use the extracted pairs.
  return kvs;
}

 // Example invocation; `responseFromTextract` is not defined in the visible snippet and
 // presumably holds the AnalyzeDocument response — confirm against the caller.
 extractKeyValue(responseFromTextract);

The code can be improved.

0

AWS provides a sample python code for key value mapping in their documentation. It is not terribly complicated. You can try to understand the logic behind the python code and then implement it in your .NET project.

Here's the mapping code: https://docs.aws.amazon.com/textract/latest/dg/examples-extract-kvp.html

Ninad Gaikwad
  • 4,272
  • 2
  • 13
  • 23
0
Here is the code for extracting key-value pairs with Textract using PHP:
<?php
use Aws\Textract\TextractClient;
require_once __DIR__.'/awscli/vendor/autoload.php';
/**
 * Analyses the given file, prints the extracted key-value pairs and stops the script.
 *
 * @param string $filename path passed on to get_kv_map()
 */
function ParserFile($filename)
{
        $maps = get_kv_map($filename);
        $pairs = get_kv_relationship($maps['key_map'], $maps['value_map'], $maps['block_map']);

        print_r($pairs);
        // Execution ends here; nothing after this call runs.
        exit;
}

/**
 * Builds an associative array that maps each key's text to its value's text.
 *
 * @param array $key_map   key blocks indexed by block id
 * @param array $value_map value blocks indexed by block id
 * @param array $block_map all blocks indexed by block id
 * @return array key text => value text; an empty array when there are no key blocks
 */
function get_kv_relationship($key_map, $value_map, $block_map)
{
        // Initialised up front so an empty $key_map yields [] instead of an undefined variable.
        $kvs = [];
        foreach ($key_map as $key_block)
        {
                $value_block = find_value_block($key_block, $value_map);
                $key = get_text($key_block, $block_map);
                // A key without a value block maps to an empty string.
                $val = ($value_block === null) ? '' : get_text($value_block, $block_map);
                $kvs[$key] = $val;
        }
        return $kvs;
}

/**
 * Returns the first value block referenced by the key block's VALUE relationships.
 *
 * Ids that are not present in $value_map are skipped.
 *
 * @param array|null $key_block key block with an optional 'Relationships' list
 * @param array      $value_map value blocks indexed by block id
 * @return array|null the value block, or null when none can be resolved
 */
function find_value_block($key_block, $value_map)
{
        if (!isset($key_block['Relationships']))
        {
                return null;
        }
        foreach ($key_block['Relationships'] as $relationship)
        {
                if ($relationship['Type'] !== 'VALUE' || !isset($relationship['Ids']))
                {
                        continue;
                }
                foreach ($relationship['Ids'] as $value_id)
                {
                        if (isset($value_map[$value_id]))
                        {
                                return $value_map[$value_id];
                        }
                }
        }
        // Explicit result for "no value block found".
        return null;
}
/**
 * Renders the text of all CHILD blocks referenced by $result.
 *
 * WORD children contribute their text and selected SELECTION_ELEMENT children
 * contribute "X". The pieces are joined with single spaces, so multi-word keys and
 * values are preserved in full instead of being reduced to their last word.
 * A null block, a block without relationships and unknown child ids yield no text.
 *
 * @param array|null $result     block whose children should be rendered
 * @param array      $blocks_map all blocks indexed by block id
 * @return string the joined text without leading or trailing separator
 */
function get_text($result, $blocks_map)
{
        if (!isset($result['Relationships']))
        {
                return '';
        }

        $parts = [];
        foreach ($result['Relationships'] as $relationship)
        {
                if ($relationship['Type'] !== 'CHILD' || !isset($relationship['Ids']))
                {
                        continue;
                }
                foreach ($relationship['Ids'] as $child_id)
                {
                        if (!isset($blocks_map[$child_id]))
                        {
                                continue;
                        }
                        $word = $blocks_map[$child_id];
                        if ($word['BlockType'] === 'WORD')
                        {
                                // Collect every word; the original kept only the last one.
                                $parts[] = $word['Text'];
                        }
                        elseif ($word['BlockType'] === 'SELECTION_ELEMENT' && $word['SelectionStatus'] === 'SELECTED')
                        {
                                $parts[] = 'X';
                        }
                }
        }

        return implode(' ', $parts);
}
/**
 * Sends the file to Textract's analyzeDocument and indexes the returned blocks.
 *
 * @param string $filename path of the document to analyse
 * @return array with 'key_map' (KEY_VALUE_SET blocks whose EntityTypes contain KEY),
 *               'value_map' (all other KEY_VALUE_SET blocks) and 'block_map' (every
 *               block), each indexed by block id
 * @throws \RuntimeException when the file cannot be read
 */
function get_kv_map($filename)
{
        // The client settings are not parameters of this function, so they are pulled in
        // from the global scope; without this they are undefined inside the function.
        // NOTE(review): assumes these four variables are defined at script level — confirm.
        global $ClientRegion, $ClientVersion, $Accesskey, $EncryptedKey;

        $client = new TextractClient([
                'region'  => $ClientRegion,
                'version' => $ClientVersion,
                'credentials' => [
                        'key'    => $Accesskey,
                        'secret' => $EncryptedKey,
                ]
        ]);

        // Read the whole file in one call. This replaces the fopen/fread pair and the
        // duplicated fclose(), and reports a failed read explicitly.
        $contents = file_get_contents($filename);
        if ($contents === false)
        {
                throw new \RuntimeException('Unable to read file: ' . $filename);
        }

        $options = [
                'Document' => [
                        'Bytes' => $contents
                ],
                'FeatureTypes' => ['FORMS', 'TABLES'], // REQUIRED
        ];
        $result = $client->analyzeDocument($options);

        $blocks = $result['Blocks'];

        $key_map = [];
        $value_map = [];
        $block_map = [];
        foreach ($blocks as $block)
        {
                $block_id = $block['Id'];
                $block_map[$block_id] = $block;
                if ($block['BlockType'] == "KEY_VALUE_SET")
                {
                        if (in_array('KEY', $block['EntityTypes']))
                        {
                                $key_map[$block_id] = $block;
                        }
                        else
                        {
                                $value_map[$block_id] = $block;
                        }
                }
        }

        return array('key_map' => $key_map, 'value_map' => $value_map, 'block_map' => $block_map);
}

// Script entry point: run the extraction on the image at this path.
$filename='imagedire.png';
ParserFile($filename);
  • Remember that Stack Overflow isn't just intended to solve the immediate problem, but also to help future readers find solutions to similar problems, which requires understanding the underlying code. This is especially important for members of our community who are beginners, and not familiar with the syntax. Given that, **can you [edit] your answer to include an explanation of what you're doing** and why you believe it is the best approach? – Jeremy Caney Apr 07 '23 at 00:30