I'm testing a data anonymizer that I found on github. But I want to add a new entity, because the author of the repository only uses ["PERSON", "EMAIL_ADDRESS", "LOCATION", "PHONE_NUMBER"]
as entities. So based on the presidio documentation I add the method add_entity
into PresidioHandler class:
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
import yaml
LANGUAGES = ["en", "es"]
ENTITIES = ["PERSON","EMAIL_ADDRESS","LOCATION","PHONE_NUMBER"]
NLP_ENGINE="spacy"
LANGUAGE_LABELS_PATH = "i18n/"
NLP_MODELS=[
{"lang_code": "en", "model_name": "en_core_web_lg"},
{"lang_code": "es", "model_name": "es_core_news_lg"},
]
class PresidioHandler:
def __init__(self):
self.languages = LANGUAGES
self.entities = ENTITIES
self.entities_language_labels = self._build_entities_language_labels(LANGUAGE_LABELS_PATH)
self.analyzer = AnalyzerEngine(
nlp_engine=NlpEngineProvider(
nlp_configuration={
"nlp_engine_name": NLP_ENGINE,
"models": NLP_MODELS}
).create_engine(),
supported_languages=self.languages
)
self.anonymizer = AnonymizerEngine()
self.operators = {entity: OperatorConfig("hash") for entity in self.entities}
def _build_entities_language_labels(self, language_labels_path):
return {
entity: {
lang: self._load_language_labels(language_labels_path, lang)[entity]
for lang in self.languages
}
for entity in self.entities
}
def _load_language_labels(self, language_labels_path, language):
with open(f'{language_labels_path}/{language}.yaml', 'r', encoding='utf-8') as f:
return yaml.safe_load(f)
@staticmethod
def _int_to_alphabet(num):
if num < 26:
return chr(65 + num)
elif num < 52:
return chr(97 + num - 26)
else:
return chr(48 + num - 52)
def _create_entities_hash_dict(self, items, language):
entities_hash_dict = {}
for entity in self.entities:
order = 0
for item in items:
if item.entity_type == entity:
if item.text not in entities_hash_dict:
entities_hash_dict[item.text] = {"replace": self.entities_language_labels[entity][language] + "_" + self._int_to_alphabet(order), "count": 1}
order += 1
else:
entities_hash_dict[item.text]["count"] += 1
return entities_hash_dict
@staticmethod
def _replace_hash(text, entities_hash_dict):
for key, value in entities_hash_dict.items():
text = text.replace(key, f"{value['replace']}")
return text
def anonymize_text(self, text, language):
anonymizer_result = self.anonymizer.anonymize(
text=text,
analyzer_results=self.analyzer.analyze(text=text, entities=self.entities, language=language),
operators=self.operators
)
items = anonymizer_result.items
items.sort(key=lambda x: x.start)
entities_hash_dict = self._create_entities_hash_dict(items=items, language=language)
return self._replace_hash(text=anonymizer_result.text, entities_hash_dict=entities_hash_dict)
def add_entity(self, entity_name, entity_list):
self.entities.append(entity_name)
entity_recognizer = PatternRecognizer(supported_entity=entity_name, deny_list=entity_list)
self.analyzer.registry.add_recognizer(entity_recognizer)
Also add in the yaml files the label for each language of the new entity.
I tested as follows:
from presidio_handler import PresidioHandler
if __name__ == '__main__':
presidio_handler = PresidioHandler()
presidio_handler.add_entity('COMPANY', ["Microsoft", "Apple"])
text = "Daniel y Alvaro trabajan en Microsoft y Apple"
language = "es"
result = presidio_handler.anonymize_text(text, language)
print(result)
When I test it in English everything works fine. But when I try in spanish I get:
Nombre_A y Nombre_B trabajan en Microsoft y Apple
The desired output is:
Nombre_A y Nombre_B trabajan en CompaƱia_A y CompaƱia_B
Does anyone know what I am doing wrong? Or what do you suggest me to create new entities? The problem is the model_name for spanish in NLP_MODELS?