I'm importing a csv list of cities into my Django app. I'm pretty new to Django and Python, and the import runs reasonably quickly, the first 25,000 rows take about 5 minutes, the next 25,000 take 2 hours. I stopped the import and started again where it left off, the next 25,000 took about 4 minutes. Obviously I'm doing it wrong as it appears that it slows down with every insert.
Any help would be great, I'm mainly doing this to learn rather than just importing data, it's currently quicker to just import straight into postgresql so I can keep going with my project, but I'd like to know what I'm doing wrong so I can get better eith Django / Python.
tia
from myapp import Country, State, City
def add_country(isocode, name):
c = Country.objects.get_or_create(name=name.strip().replace('"', ''), isocode=isocode.strip())[0]
return c
def add_state(country, isocode, name, statetype):
country_model = Country.objects.get(isocode=country.strip().lower())
s = State.objects.get_or_create(name=name.strip().replace('"', ''), isocode=isocode.strip().lower().replace('"', ''), country=country_model, statetype=statetype.strip().replace('"', ''))[0]
return s
def add_city(country, state, name):
country_model = Country.objects.get(isocode=country.strip().lower().replace('"', ''))
try:
state_model = State.objects.get(name=state.strip().replace('"', ''), country=country_model)
except State.DoesNotExist:
state_model = None
ci = City.objects.get_or_create(name=name.strip().replace('"', ''), state=state_model, postcode='')[0]
return ci
with open('country.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_country(counrow[0], counrow[1])
with open('state.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_state(counrow[0], counrow[1], counrow[2], counrow[3])
with open('city1.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_city(counrow[0], counrow[1], counrow[2])
with open('city2.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_city(counrow[0], counrow[1], counrow[2])
with open('city3.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_city(counrow[0], counrow[1], counrow[2])
UPDATE:
So I've changed the code to use bulk insert, the first group of cities is now just over two minutes, the second lot is 10 minutes, I haven't finished with the third after a couple of hours. There must be some sort of garbage collection process or something that I'm missing, as I've even switched the files around, each file takes the same amount of time when it's run first.
The new code looks like this:
def add_country(isocode, name, created_by, changed_by, country_list):
country_list.append(Country(name=name.strip().replace('"', ''), isocode=isocode.strip()))
def add_state(country, isocode, name, statetype, created_by, changed_by, state_list):
country_model = Country.objects.get(isocode=country.strip().lower())
state_list.append(State(name=name.strip().replace('"', ''), isocode=isocode.strip().lower().replace('"', ''), country=country_model, statetype=statetype.strip().replace('"', '')))
def add_city(country, state, name, created_by, changed_by, city_list):
country_model = Country.objects.get(isocode=country.strip().lower().replace('"', ''))
try:
state_model = State.objects.get(name=state.strip().replace('"', ''), country=country_model)
except State.DoesNotExist:
state_model = None
city_list.append(City(name=name.strip().replace('"', ''), state=state_model, postcode=''))
country_list = []
state_list = []
city_list = []
print "Countries"
print time.strftime("%H:%M:%S")
with open('country.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_country(counrow[0], counrow[1], adminuser, adminuser, country_list)
Country.objects.bulk_create(country_list)
print "States"
print time.strftime("%H:%M:%S")
with open('state.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_state(counrow[0], counrow[1], counrow[2], counrow[3], adminuser, adminuser, state_list)
State.objects.bulk_create(state_list)
print "Cities 1"
print time.strftime("%H:%M:%S")
with open('city1.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_city(counrow[0], counrow[1], counrow[2], adminuser, adminuser, city_list)
City.objects.bulk_create(city_list)
print "Cities 2"
print time.strftime("%H:%M:%S")
city_list = []
with open('city2.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_city(counrow[0], counrow[1], counrow[2], adminuser, adminuser, city_list)
City.objects.bulk_create(city_list)
print "Cities 3"
print time.strftime("%H:%M:%S")
city_list = []
with open('city3.csv', 'rb') as csvfile:
myreader = csv.reader(csvfile, delimiter=',', quotechar='"')
for counrow in myreader:
add_city(counrow[0], counrow[1], counrow[2], adminuser, adminuser, city_list)
City.objects.bulk_create(city_list)