To identify the tokens ("IN", "=", "LIKE") I have used the get_tokens() which recursively get the tokens added to the extracted_key() dictionary. Here in the extracted_keys dict the key address stored instead of the actual key Since the functions is going recursively. So I have used create_dict() another function to create a proper dictionary out of extracted_keys dictionary. For now we are able to extract columns along with their conditions from the SQL WHERE conditon for following operators i.e., "IN" operator, "=" operator, "Like" operator.
To install sqlparse module
pip install sqlparse
import sqlparse
def get_tokens(where):
identifier = None
extracted_keys = {}
for i in where.tokens:
try:
name = i.get_real_name()
if name and isinstance(i, sqlparse.sql.Identifier):
name = i.get_real_name()
identifier = i
extracted_keys[identifier] = i.value
elif identifier and isinstance(i, sqlparse.sql.Parenthesis):
extracted_keys[identifier] = i.value
elif i and "in" not in i.value.lower() and "or" not in i.value.lower() and isinstance(i, sqlparse.sql.Comparison):
if "=" in i.value:
equal_index = i.value.index("=")
if i.value[equal_index -1] not in ["!", ">", "<"]:
key,value = i.value.split("=")[0], i.value.split("=")[1]
extracted_keys[key] = value
if "!=" in i.value:
key,value = i.value.split("!=")[0], i.value.split("!=")[1]
extracted_keys[key] = value
elif "like" in i.value.lower():
key,value = i.value.lower().split("like")[0].upper(), i.value.lower().split("like")[1]
extracted_keys[key] = value
else:
extracted_keys.update(get_tokens(i))
except Exception as error:
pass
return extracted_keys
def create_dict(extracted_keys):
cols = {}
for key, value in extracted_keys.items():
try:
if key.get_real_name():
cols[key.get_real_name()] = value
except Exception as e:
cols[key] = value
return cols
For example:
query = "SELECT COUNT(*) as cnt FROM dbo.main_table WHERE (VKRKA10001920015 = 1 AND age IN (15, 16, 17, 18, 19) AND income IN (101, 102, 301, 302, 305))"
parsed = sqlparse.parse(query)
where = parsed[0][-1]
extracted_keys = get_tokens(where)
extracted_keys = create_dict(extracted_keys)
Output
{
'VKRKA10001920015': 1,
'age' : (15, 16, 17, 18, 19),
'income' : (101, 102, 301, 302, 305),
}