
What I'm doing:

  • I'm running a query against a MySQL table and exporting each day's worth of data as a CSV into a folder
  • I then insert each CSV, row by row, into a separate MySQL table using a for loop
  • Once a CSV is loaded into the table, I move it into a separate processed folder

The problem is that it takes a very long time to run. I would like help finding areas where I can speed up the process, or suggestions for alternative approaches in Python.

Code:

import pymysql
import pymysql.cursors
import csv
import os
import shutil
import datetime
from db_credentials import db1_config, db2...

def date_range(start, end):
    # Creates a list of dates from start to end
    ...

def export_csv(filename, data):
    # Exports query result as a csv to the filename's pending folder
    ...

def extract_from_db(database, sql, start_date, end_date, filename):
    # SQL query to extract data and export as csv
    ...

def open_csv(c):
    # Read csv and return as a list of lists
    ...

def get_files(folder):
    # Grab all csv files from a given folder's pending folder
    ...

# HERE IS WHERE IT GETS SLOW

def load_to_db(table, folder):

    print('Uploading...\n')

    files = get_files(folder)

    # Connect to db2 database
    connection = pymysql.connect(**db2_config, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

    try:
        with connection.cursor() as cursor:

            # Open each csv in the files list and ignore column headers
            for file in files:                
                print('Processing ' + file.split("pending/",1)[1] + '...', end='')

                csv_file = open_csv(file)
                csv_headers = ', '.join(csv_file[0])
                csv_data = csv_file[1:]

                # Insert each row of each csv into db2 table
                for row in csv_data:
                    placeholders = ', '.join(['%s'] * len(row))
                    sql = "INSERT INTO %s (%s) VALUES ( %s )" % (table, csv_headers, placeholders)
                    cursor.execute(sql, row)

                # Move processed file to the processed folder
                destination_folder = os.path.join('/Users','python', folder, 'processed')
                shutil.move(file, destination_folder)

                print('DONE')

        # Connection is not autocommit by default.
        # So you must commit to save your changes.
        connection.commit()

    finally:
        connection.close()

    if not files:
        print('No csv data available to process')
    else:
        print('Finished')

1 Answer

How about trying MySQL's LOAD DATA?

For example, execute the following statement once for the entire CSV rather than issuing individual INSERTs:

LOAD DATA INFILE '<your filename>' 
INTO TABLE <your table> 
FIELDS TERMINATED BY ',' 
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
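
Note that plain LOAD DATA INFILE reads a file from the MySQL server's filesystem; since your CSVs are exported on the client machine, you would want the LOCAL variant. Here is a minimal sketch of driving it from your existing pymysql setup, assuming local_infile is permitted on both the server and the connection; the table name and file path below are placeholders:

import pymysql
from db_credentials import db2_config

# local_infile must be enabled on the connection (and allowed by the server)
connection = pymysql.connect(**db2_config, charset='utf8mb4', local_infile=True)

try:
    with connection.cursor() as cursor:
        # One statement loads the whole file; pymysql quotes the path for us.
        # 'my_table' is a placeholder -- substitute your destination table.
        sql = """
            LOAD DATA LOCAL INFILE %s
            INTO TABLE my_table
            FIELDS TERMINATED BY ','
            ENCLOSED BY '"'
            LINES TERMINATED BY '\\n'
            IGNORE 1 ROWS
        """
        cursor.execute(sql, ('/Users/python/folder/pending/example.csv',))
    connection.commit()
finally:
    connection.close()

This typically beats row-by-row INSERTs by a wide margin, because the per-row network round-trips disappear and the server's bulk loader parses the file in one pass.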