I have a large JSON data file of 3.7 GB. I am going to load the JSON file into a dataframe, delete the unused columns, then convert it to CSV and load it into SQL. My machine has 40 GB of RAM. My JSON file looks like this (a rough sketch of the pipeline I have in mind follows the sample lines):
{"a":"Ho Chi Minh City, Vietnam","gender":"female","t":"841675194476","id":"100012998502085","n":"Lee Mến"}
{"t":"84945474479","id":"100012998505399","n":"Hoàng Giagia"}
{"t":"841679770421","id":"100012998505466","n":"Thoại Mỹ"}
I tried to load the data, but it fails because it runs out of memory:
import ijson

data_phone = []
with open('data.json', 'r', encoding="UTF-8") as f:
    # Stream only the "t" field of every top-level object
    numbers = ijson.items(f, 't', multiple_values=True)
    for num in numbers:
        data_phone.append(num)
It shows this error:
Out of memory
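As far as I understand, appending every value to data_phone still builds the whole list in RAM, so this is the streaming version I think ijson is meant for, writing each value straight to a file instead of keeping it in a list (phones.txt is only a placeholder name):

import ijson

with open('data.json', 'r', encoding="UTF-8") as f, \
     open('phones.txt', 'w', encoding="UTF-8") as out:
    # Stream only the "t" field of each object; nothing accumulates in memory
    for num in ijson.items(f, 't', multiple_values=True):
        out.write(str(num) + '\n')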
I tried another way:
import json

fb_data = {}
i = 1
with open('output.csv', 'w') as csv_file:
    with open("Vietnam_Facebook_Scrape.json", encoding="UTF-8") as json_file:
        for line in json_file:
            data = json.loads(line)
            try:
                # Join the counter, a "/", the phone number and the Facebook id with "; "
                csv_file.write('; '.join([str(i), "/", data["t"], data["fbid"]]))
            except:
                pass
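For comparison, here is the same conversion written as a sketch with the csv module, using the "id" key from my sample lines above and writing one row per record:

import csv
import json

with open("Vietnam_Facebook_Scrape.json", encoding="UTF-8") as json_file, \
     open('output.csv', 'w', newline='', encoding="UTF-8") as csv_file:
    writer = csv.writer(csv_file, delimiter=';')
    for i, line in enumerate(json_file, start=1):
        data = json.loads(line)
        # Skip records that are missing the phone number or the id
        if "t" in data and "id" in data:
            writer.writerow([i, data["t"], data["id"]])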
Then I convert from CSV to SQL, and it still shows the error "MemoryError":
import sqlite3 as db

con = db.connect("fbproject.db")
cur = con.cursor()

with open('output.csv', 'r', encoding="UTF-8") as csv_file:
    # Insert each line of the CSV as a single value
    for item in csv_file:
        cur.execute('insert into fbdata values (?)', (item,))

con.commit()
con.close()
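And for reference, this is the CSV-to-SQLite step as I think it should look when reading row by row with the csv module and inserting in batches with executemany (a sketch; I'm assuming the fbdata table has three columns matching the rows written above):

import csv
import sqlite3

con = sqlite3.connect("fbproject.db")
cur = con.cursor()

with open('output.csv', 'r', newline='', encoding="UTF-8") as csv_file:
    reader = csv.reader(csv_file, delimiter=';')
    batch = []
    for row in reader:
        batch.append(row)
        # Insert in batches of 10,000 rows so memory use stays flat
        if len(batch) >= 10_000:
            cur.executemany('insert into fbdata values (?, ?, ?)', batch)
            batch.clear()
    if batch:
        cur.executemany('insert into fbdata values (?, ?, ?)', batch)

con.commit()
con.close()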
Thanks for reading