I have a table in DB with almost 20 million records. I want to search through regex. It was all good when the number of records were about 100 thousand. But now it takes like quite a lot time even sometime it results in timeout. Does I need to migrate to SQL database may be postgresql
or something like elastic search. As the records in this table is expected to increase more than 20 billion. Can there is a way to make it efficient by keeping the same settings as I'm using djongo
to connect django
to mongodb
or I have to use any other database for fast searching.
My model schema is
from djongo import models as model
class User(model.Model):
email = model.CharField(max_length=50, default='')
source = model.CharField(default='unknown',max_length=150)
username = model.CharField(max_length=150, default='')
hash = model.CharField(max_length=255, default='')
salt = model.CharField(max_length=255, default='')
ipaddress = model.CharField(max_length=50,default='')
lastipaddress = model.CharField(max_length=50,default='')
name = model.CharField(max_length=150, default='')
dateofbirth = model.CharField(max_length=100, default='')
phonenumber = model.CharField(max_length=100, default='')
firstname = model.CharField(max_length=150, default='')
lastname = model.CharField(max_length=150, default='')
address = model.CharField(max_length=255, default='')
objects = model.DjongoManager()
This method is called when an post request is send to django
@api_view(['POST'])
@authentication_classes([authentication.TokenAuthentication])
@permission_classes([permissions.IsAdminUser])
def search(request):
if 'username' in request.data:
username = request.data['username']
if 'email' in request.data:
useremail = request.data['email']
if 'userid' in request.data:
userid = request.data['userid']
if 'query' in request.data:
query = request.data['query']
else:
return Response(status.HTTP_400_BAD_REQUEST)
obj = {}
obj['query'] = query
obj['type'] = type
obj['wildcard'] = wildcard
obj['regex'] = regex
if not (type in ['email', 'domain', 'username'] and wildcard == 'false' and regex == 'false'):
obj['request'] = request
final = []
print('wildcard', wildcard)
print('regex', regex)
print('type', type)
if wildcard == 'true' or regex == 'true':
with concurrent.futures.ThreadPoolExecutor() as executor:
t1 = executor.submit(getRecordsFromDB, obj)
final = t1.result()
return final
Called by the above method this where regex queries are performed
def getRecordsFromDB(obj):
max_limit = 10000
if obj['wildcard'] == "false" and obj['regex'] == "true":
print("yes regex thing")
if obj['type'] == 'domain':
obj['query'] = r'.+@{1}' + obj['query']
obj['type'] = 'email'
try:
pagination_class = LimitOffsetPagination
paginator = pagination_class()
queryset = User.objects.mongo_find({
obj['type']: {'$regex': obj['query']}
}).count()
if queryset > max_limit:
return Response(status.HTTP_507_INSUFFICIENT_STORAGE)
else:
queryset = User.objects.mongo_find({
obj['type']: {'$regex': obj['query']}
})
page = paginator.paginate_queryset(queryset, obj['request'])
serializer = UserSerializer(page, many=True)
return paginator.get_paginated_response(serializer.data)
except Exception as err:
print(f'Other error occurred: {err}')
return Response(status.HTTP_422_UNPROCESSABLE_ENTITY)
elif obj['wildcard'] == "true" and obj['regex'] == "false":
print("yes wildcard thing")
#obj['query'] = obj['query'].replace('.', r'\.')
obj['query'] = re.escape(obj['query'])
obj['query'] = obj['query'].replace('\*', r'[a-zA-Z0-9-_.]*')
print('below is the respective regex for the given query')
print(obj['query'])
if obj['query'][0] != r'*' and obj['type'] != 'domain':
print('yes here where it should not be')
obj['query'] = r'^' + obj['query']
if len(obj['query']) > 1:
if obj['query'][-1] != r'*':
obj['query'] = obj['query'] + r'$'
print('final regex ', obj['query'])
if obj['type'] == 'domain':
obj['query'] = r'.+@{1}' + obj['query']
obj['type'] = 'email'
print('very final regex ', obj['query'])
try:
pagination_class = LimitOffsetPagination
paginator = pagination_class()
queryset = User.objects.mongo_find({
obj['type']: {'$regex': obj['query']}
}).count()
if queryset > max_limit:
return Response(status.HTTP_507_INSUFFICIENT_STORAGE)
else:
queryset = User.objects.mongo_find({
obj['type']: {'$regex': obj['query']}
})
page = paginator.paginate_queryset(queryset, obj['request'])
serializer = UserSerializer(page, many=True)
return paginator.get_paginated_response(serializer.data)
except Exception as err:
print(f'Other error occurred: {err}')
return Response(status.HTTP_422_UNPROCESSABLE_ENTITY)
return records