0

I am trying to load the 8m[rows] * 1k[columns] python dataframe into Mongo. For the performance improvement I have planned to use Mongo bulk operation.There will be update also I have to do in the collections on daily basis so that I used the upsert method of bulk operation. Below is the code which I have prepared,

def BulkLoad(self, Dataset):

    counter = 0;        
    empty = '000'

    columns = []
    records = []
    DataDict = {}
    for col in Dataset.columns:
        columns.append(col)

    try:
        db = self.getDatabase()
        bulk = db.collection.initialize_ordered_bulk_op()

        for j in range(len(Dataset)):
            records.clear()
            DataDict.clear()
            DataDict.update(
                    {'CreatedBy': empty, 'ModifiedBy': empty, 'Value': Score})

            for column in columns:
                colValue = str(Dataset[column][j])
                if (colValue == 'nan'):
                    colValue = colValue.replace('nan', '')


                DataDict.update({column: colValue})

            records.append(DataDict)
            print("list is ",records)

            Id = DataDict['Id']
            Number = DataDict['Number']

            print(DataDict)               

            bulk.find(
                    {'Id': Id, 'Number': Number}).upsert().update(
                    {
                         '$set': {'Id': Id, 'Number': Number,'Events':records}
                    })

            counter += 1

            if counter % 1000 == 0:
                result = bulk.execute()
                logging.info(pprint(result))
                bulk = db.coll.initialize_ordered_bulk_op()

        if counter % 1000 != 0:
            result = bulk.execute()
            logging.info(pprint(result))

    except Exception as e:
        logging.exception(e)
    except BulkWriteError as bre:
        logging.error(pprint(bre.details))

If I am loading the sample rows of 10 into Mongo collections, All the documents are having the same value of 10th row. I knew its because of python dictionary reference problem .

Can you please anyone give me suggestion on that ?

1 Answers1

0

def BulkLoad(self, Dataset):

counter = 0;        
empty = '000'

columns = []
records = []

for col in Dataset.columns:
    columns.append(col)

try:
    db = self.getDatabase()
    bulk = db.collection.initialize_ordered_bulk_op()

    for j in range(len(Dataset)):
        DataDict = {}            
        DataDict.update(
                {'CreatedBy': empty, 'ModifiedBy': empty, 'Value': Score})

        for column in columns:
            colValue = str(Dataset[column][j])
            if (colValue == 'nan'):
                colValue = colValue.replace('nan', '')


            DataDict.update({column: colValue})

        records.append(DataDict)
        print("list is ",records)

        Id = DataDict['Id']
        Number = DataDict['Number']

        print(DataDict)               

        bulk.find(
                {'Id': Id, 'Number': Number}).upsert().update(
                {
                     '$set': {'Id': Id, 'Number': Number,'Events':records}
                })

        counter += 1

        if counter % 1000 == 0:
            result = bulk.execute()
            logging.info(pprint(result))
            bulk = db.coll.initialize_ordered_bulk_op()

    if counter % 1000 != 0:
        result = bulk.execute()
        logging.info(pprint(result))

except Exception as e:
    logging.exception(e)
except BulkWriteError as bre:
    logging.error(pprint(bre.details))