I have a table in the DB with almost 20 million records, and I want to search it with regex. Everything was fine when there were about 100 thousand records, but now queries take quite a long time and sometimes even time out. Do I need to migrate to a SQL database, maybe PostgreSQL, or to something like Elasticsearch? The number of records in this table is expected to grow past 20 billion. Is there a way to make the search efficient while keeping my current setup (I'm using djongo to connect Django to MongoDB), or do I have to use a different database for fast searching?

My model schema is:

from djongo import models as model

class User(model.Model):
    email = model.CharField(max_length=50, default='')
    source = model.CharField(default='unknown', max_length=150)
    username = model.CharField(max_length=150, default='')
    hash = model.CharField(max_length=255, default='')
    salt = model.CharField(max_length=255, default='')
    ipaddress = model.CharField(max_length=50, default='')
    lastipaddress = model.CharField(max_length=50, default='')
    name = model.CharField(max_length=150, default='')
    dateofbirth = model.CharField(max_length=100, default='')
    phonenumber = model.CharField(max_length=100, default='')
    firstname = model.CharField(max_length=150, default='')
    lastname = model.CharField(max_length=150, default='')
    address = model.CharField(max_length=255, default='')
    objects = model.DjongoManager()
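
For reference, none of these fields carries an index. Since djongo's index support is part of its paid tier (as the comments below note), one workaround is to create the MongoDB indexes directly with pymongo. The following is only a sketch: the connection URI, database name, and collection name (djongo uses Django's default db_table, i.e. <app_label>_user) are assumptions.

import pymongo

# Assumed connection details; adjust to the actual deployment.
client = pymongo.MongoClient('mongodb://localhost:27017')
collection = client['mydb']['myapp_user']

# Single-field indexes on the fields the search endpoint filters on.
# Only case-sensitive, prefix-anchored regexes (^...) can make use of them.
collection.create_index([('email', pymongo.ASCENDING)])
collection.create_index([('username', pymongo.ASCENDING)])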

This method is called when a POST request is sent to Django:

import concurrent.futures

from rest_framework import authentication, permissions, status
from rest_framework.decorators import api_view, authentication_classes, permission_classes
from rest_framework.response import Response


@api_view(['POST'])
@authentication_classes([authentication.TokenAuthentication])
@permission_classes([permissions.IsAdminUser])
def search(request):
    if 'username' in request.data:
        username = request.data['username']

    if 'email' in request.data:
        useremail = request.data['email']

    if 'userid' in request.data:
        userid = request.data['userid']

    if 'query' in request.data:
        query = request.data['query']
    else:
        return Response(status=status.HTTP_400_BAD_REQUEST)

    # Read the remaining search flags from the request body.
    type = request.data.get('type', '')
    wildcard = request.data.get('wildcard', 'false')
    regex = request.data.get('regex', 'false')

    obj = {}
    obj['query'] = query
    obj['type'] = type
    obj['wildcard'] = wildcard
    obj['regex'] = regex
    if not (type in ['email', 'domain', 'username'] and wildcard == 'false' and regex == 'false'):
        obj['request'] = request
    final = []
    print('wildcard', wildcard)
    print('regex', regex)
    print('type', type)
    if wildcard == 'true' or regex == 'true':
        # A single-task pool adds no parallelism here: result() blocks anyway.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            t1 = executor.submit(getRecordsFromDB, obj)
            final = t1.result()

    return final
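
For reference, a request that reaches the regex branch of this view might look like the sketch below; the endpoint URL and token are placeholders, and the payload keys are the ones search() reads.

import requests

# Hypothetical endpoint and token; the keys mirror what search() expects.
resp = requests.post(
    'http://localhost:8000/api/search/',
    headers={'Authorization': 'Token <token>'},
    json={
        'query': r'apple\.com$',
        'type': 'domain',
        'wildcard': 'false',
        'regex': 'true',
    },
)
print(resp.status_code, resp.json())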

Called by the method above, this is where the regex queries are performed:

import re

from rest_framework.pagination import LimitOffsetPagination


def getRecordsFromDB(obj):
    max_limit = 10000
    if obj['wildcard'] == "false" and obj['regex'] == "true":
        print("yes regex thing")
        if obj['type'] == 'domain':
            obj['query'] = r'.+@{1}' + obj['query']
            obj['type'] = 'email'

        try:
            paginator = LimitOffsetPagination()
            count = User.objects.mongo_find({
                obj['type']: {'$regex': obj['query']}
            }).count()
            if count > max_limit:
                return Response(status=status.HTTP_507_INSUFFICIENT_STORAGE)
            queryset = User.objects.mongo_find({
                obj['type']: {'$regex': obj['query']}
            })
            page = paginator.paginate_queryset(queryset, obj['request'])
            serializer = UserSerializer(page, many=True)
            return paginator.get_paginated_response(serializer.data)
        except Exception as err:
            print(f'Other error occurred: {err}')
            return Response(status=status.HTTP_422_UNPROCESSABLE_ENTITY)

    elif obj['wildcard'] == "true" and obj['regex'] == "false":
        print("yes wildcard thing")
        # Record where the wildcards sit before escaping: re.escape turns a
        # leading '*' into '\*', so comparing the escaped string against '*'
        # would never match.
        starts_with_star = obj['query'].startswith('*')
        ends_with_star = obj['query'].endswith('*')
        obj['query'] = re.escape(obj['query'])
        obj['query'] = obj['query'].replace(r'\*', r'[a-zA-Z0-9-_.]*')
        print('below is the respective regex for the given query')
        print(obj['query'])
        if not starts_with_star and obj['type'] != 'domain':
            obj['query'] = r'^' + obj['query']
        if len(obj['query']) > 1 and not ends_with_star:
            obj['query'] = obj['query'] + r'$'

        print('final regex ', obj['query'])
        if obj['type'] == 'domain':
            obj['query'] = r'.+@{1}' + obj['query']
            obj['type'] = 'email'
            print('very final regex ', obj['query'])
        try:
            paginator = LimitOffsetPagination()
            count = User.objects.mongo_find({
                obj['type']: {'$regex': obj['query']}
            }).count()
            if count > max_limit:
                return Response(status=status.HTTP_507_INSUFFICIENT_STORAGE)
            queryset = User.objects.mongo_find({
                obj['type']: {'$regex': obj['query']}
            })
            page = paginator.paginate_queryset(queryset, obj['request'])
            serializer = UserSerializer(page, many=True)
            return paginator.get_paginated_response(serializer.data)
        except Exception as err:
            print(f'Other error occurred: {err}')
            return Response(status=status.HTTP_422_UNPROCESSABLE_ENTITY)

    # Neither branch matched: reject the request.
    return Response(status=status.HTTP_400_BAD_REQUEST)
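
The slow part is the `$regex` query itself: an unanchored pattern such as `.+@apple\.com$` forces MongoDB to run the regex against every document (a COLLSCAN) even if the field is indexed, while a case-sensitive pattern anchored with `^` can walk an index (IXSCAN). A small sketch to confirm this with pymongo's explain(); the connection details are assumptions as above.

import pymongo

collection = pymongo.MongoClient('mongodb://localhost:27017')['mydb']['myapp_user']

# Prefix-anchored and case-sensitive: eligible for an index on 'email'.
fast = collection.find({'email': {'$regex': '^daud'}})

# Unanchored: MongoDB must scan the whole collection regardless of indexes.
slow = collection.find({'email': {'$regex': r'.+@apple\.com$'}})

# With an index on 'email', the first plan shows IXSCAN, the second COLLSCAN.
print(fast.explain()['queryPlanner']['winningPlan'])
print(slow.explain()['queryPlanner']['winningPlan'])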
  • Basically `queryset = User.objects.mongo_find({obj['type']: {'$regex': obj['query']}})` is the query that takes long? Can you show us an example of `obj['type']` and `obj['query']` where the query takes a long time? – dirkgroten Feb 18 '20 at 14:48
  • Yes exactly this query. – Daud Ahmed Feb 19 '20 at 03:26
  • queryset = User.objects.mongo_find({ "email": {'$regex': ".+@{1}@apple\.com$" }}) – Daud Ahmed Feb 19 '20 at 03:28
  • Have you indexed the `email`? I know indexing is part of the professional djongo package so it’s not free but it’s crucial at that scale. – dirkgroten Feb 19 '20 at 07:46
  • I didn't. It's kind of expensive, like paying $50 per month. – Daud Ahmed Feb 19 '20 at 08:17
  • That’s cheap. With the scale you’re talking about you won’t find any solution that doesn’t cost money. Elasticsearch in the cloud isn’t cheap either. – dirkgroten Feb 19 '20 at 08:21
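
One index-friendly alternative for the domain lookup quoted above: store each email reversed in an extra field, so a suffix match on email becomes a prefix match on the reversed field, which an index can serve. This is only a sketch under assumed names; the email_reversed field does not exist in the model above, and at this scale the backfill would need batching.

import re

import pymongo

collection = pymongo.MongoClient('mongodb://localhost:27017')['mydb']['myapp_user']

# One-off backfill: reverse each email, then index the result.
for doc in collection.find({}, {'email': 1}):
    collection.update_one(
        {'_id': doc['_id']},
        {'$set': {'email_reversed': doc.get('email', '')[::-1]}},
    )
collection.create_index([('email_reversed', pymongo.ASCENDING)])

# '@apple.com' as a suffix of email is a '^'-anchored prefix of email_reversed,
# so the regex below can use the index instead of scanning the collection.
domain = '@apple.com'
cursor = collection.find({'email_reversed': {'$regex': '^' + re.escape(domain[::-1])}})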

0 Answers