0

I'm testing a data anonymizer that I found on github. But I want to add a new entity, because the author of the repository only uses ["PERSON", "EMAIL_ADDRESS", "LOCATION", "PHONE_NUMBER"] as entities. So based on the presidio documentation I add the method add_entity into PresidioHandler class:

from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
import yaml

LANGUAGES = ["en", "es"]
ENTITIES = ["PERSON","EMAIL_ADDRESS","LOCATION","PHONE_NUMBER"]
NLP_ENGINE="spacy"
LANGUAGE_LABELS_PATH = "i18n/"
NLP_MODELS=[
    {"lang_code": "en", "model_name": "en_core_web_lg"},
    {"lang_code": "es", "model_name": "es_core_news_lg"},
]

class PresidioHandler:
    def __init__(self):
        self.languages = LANGUAGES
        self.entities = ENTITIES
        self.entities_language_labels = self._build_entities_language_labels(LANGUAGE_LABELS_PATH)

        self.analyzer = AnalyzerEngine(
            nlp_engine=NlpEngineProvider(
                nlp_configuration={
                    "nlp_engine_name": NLP_ENGINE,
                    "models": NLP_MODELS}
            ).create_engine(),
            supported_languages=self.languages
        )
        self.anonymizer = AnonymizerEngine()
        self.operators = {entity: OperatorConfig("hash") for entity in self.entities}

    def _build_entities_language_labels(self, language_labels_path):
        return {
            entity: {
                lang: self._load_language_labels(language_labels_path, lang)[entity] 
                for lang in self.languages
            } 
            for entity in self.entities
        }

    def _load_language_labels(self, language_labels_path, language):
        with open(f'{language_labels_path}/{language}.yaml', 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    @staticmethod
    def _int_to_alphabet(num):
        if num < 26:
            return chr(65 + num)
        elif num < 52:
            return chr(97 + num - 26)
        else:
            return chr(48 + num - 52)

    def _create_entities_hash_dict(self, items, language):
        entities_hash_dict = {}
        for entity in self.entities:
            order = 0
            for item in items:
                if item.entity_type == entity:
                    if item.text not in entities_hash_dict:
                        entities_hash_dict[item.text] = {"replace": self.entities_language_labels[entity][language] + "_" + self._int_to_alphabet(order), "count": 1}
                        order += 1
                    else:
                        entities_hash_dict[item.text]["count"] += 1
        return entities_hash_dict

    @staticmethod
    def _replace_hash(text, entities_hash_dict):
        for key, value in entities_hash_dict.items():
            text = text.replace(key, f"{value['replace']}")
        return text

    def anonymize_text(self, text, language):
        anonymizer_result = self.anonymizer.anonymize(
            text=text,
            analyzer_results=self.analyzer.analyze(text=text, entities=self.entities, language=language),
            operators=self.operators
        )

        items = anonymizer_result.items
        items.sort(key=lambda x: x.start)
        entities_hash_dict = self._create_entities_hash_dict(items=items, language=language)

        return self._replace_hash(text=anonymizer_result.text, entities_hash_dict=entities_hash_dict)
    
    def add_entity(self, entity_name, entity_list):
        self.entities.append(entity_name)
        entity_recognizer = PatternRecognizer(supported_entity=entity_name, deny_list=entity_list)
        self.analyzer.registry.add_recognizer(entity_recognizer)

Also add in the yaml files the label for each language of the new entity. enter image description here I tested as follows:

from presidio_handler import PresidioHandler

if __name__ == '__main__':
    presidio_handler = PresidioHandler()
    presidio_handler.add_entity('COMPANY', ["Microsoft", "Apple"])
    text = "Daniel y Alvaro trabajan en Microsoft y Apple"
    language = "es"
    result = presidio_handler.anonymize_text(text, language)
    print(result)

When I test it in English everything works fine. But when I try in spanish I get:

Nombre_A y Nombre_B trabajan en Microsoft y Apple

The desired output is:

Nombre_A y Nombre_B trabajan en CompaƱia_A y CompaƱia_B

Does anyone know what I am doing wrong? Or what do you suggest me to create new entities? The problem is the model_name for spanish in NLP_MODELS?

suribe06
  • 69
  • 8

0 Answers0