I have created this file watcher loop. When I run the code, it scans a specific folder for .json files and appends their data to an 'output.xlsx' file. The code then keeps running in a loop, scanning the folder for new files and repeating the process. This works just fine; however, when the code stops running (the laptop turns off, or something similar), new files are still being added to the folder, and when I re-run the code I cannot continue where I left off: I have to delete the output.xlsx file and start over.
Is there a way to save the history of the files that were already appended when the code stops, and then continue with only the files that have not been appended yet when I re-run it? Here is my current code:
import os
import glob
import json
import pandas as pd
import time
from datetime import datetime
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows

def jsonFilesInDirectory(my_dir: str):
    # Get a list of JSON files in the directory; return just the bare file
    # names so they can be joined with my_dir later and logged cleanly
    json_files = [os.path.basename(p) for p in glob.glob(os.path.join(my_dir, "*.json"))]
    return json_files

def clean_value(value):
    # Strip the unit suffix ('\xa0' is a non-breaking space, e.g. '1.2\xa0s')
    # and thousands separators, then convert the display value to a float
    return float(value.replace('\xa0s', '').replace('\xa0ms', '').replace(',', ''))

def doThingsWithNewFiles(fileDiff: list, my_dir: str, workbook):
    for file_name in fileDiff:
        file_path = os.path.join(my_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data = json.load(file)
                # Extract data from the JSON file
                url = json_data["finalUrl"]
                fetch_time = json_data["fetchTime"]
                audits = json_data["audits"]
                fcp_metric = audits["first-contentful-paint"]["id"]
                fcp_value = audits["first-contentful-paint"]["displayValue"]
                fcp_score = audits["first-contentful-paint"]["score"]
                lcp_metric = audits["largest-contentful-paint"]["id"]
                lcp_value = audits["largest-contentful-paint"]["displayValue"]
                lcp_score = audits["largest-contentful-paint"]["score"]
                fmp_metric = audits["first-meaningful-paint"]["id"]
                fmp_value = audits["first-meaningful-paint"]["displayValue"]
                fmp_score = audits["first-meaningful-paint"]["score"]
                si_metric = audits["speed-index"]["id"]
                si_value = audits["speed-index"]["displayValue"]
                si_score = audits["speed-index"]["score"]
                tbt_metric = audits["total-blocking-time"]["id"]
                tbt_value = audits["total-blocking-time"]["displayValue"]
                tbt_score = audits["total-blocking-time"]["score"]
                cls_metric = audits["cumulative-layout-shift"]["id"]
                cls_value = audits["cumulative-layout-shift"]["displayValue"]
                cls_score = audits["cumulative-layout-shift"]["score"]
                categories = json_data["categories"]
                perf_metric = categories["performance"]["id"]
                perf_value = 0
                perf_score = categories["performance"]["score"]
                # Clean up values and format the fetch time
                cleaned_fcp_value = clean_value(fcp_value)
                cleaned_lcp_value = clean_value(lcp_value)
                cleaned_fmp_value = clean_value(fmp_value)
                cleaned_si_value = clean_value(si_value)
                cleaned_tbt_value = clean_value(tbt_value)
                datetime_obj = datetime.strptime(fetch_time, "%Y-%m-%dT%H:%M:%S.%fZ")
                cleaned_fetch_time = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")
                # Create a data dictionary for the DataFrame
                # (metric labels use the hyphenated ids consistently)
                data_dict = {
                    "fetch_time": [cleaned_fetch_time] * 7,
                    "url": [url] * 7,
                    "metric": ["performance", "first-contentful-paint", "largest-contentful-paint",
                               "first-meaningful-paint", "speed-index", "total-blocking-time",
                               "cumulative-layout-shift"],
                    "value": [perf_value, cleaned_fcp_value, cleaned_lcp_value,
                              cleaned_fmp_value, cleaned_si_value, cleaned_tbt_value,
                              cls_value],  # the CLS display value has no unit suffix
                    "score": [perf_score, fcp_score, lcp_score, fmp_score, si_score, tbt_score, cls_score]
                }
                df = pd.DataFrame(data_dict)
                # Append the DataFrame to the Excel file
                sheet_name = "Sheet1"
                if sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                    # sheet.append() writes below the last used row, so no
                    # explicit start row is needed
                    for row in dataframe_to_rows(df, index=False, header=False):
                        sheet.append(row)
                else:
                    sheet = workbook.create_sheet(sheet_name)
                    for row in dataframe_to_rows(df, index=False, header=True):
                        sheet.append(row)
                print(f"Data extracted from {file_name} and appended to the Excel file")
            except KeyError as e:
                print(f"KeyError occurred while processing file '{file_name}': {e}")
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError occurred while processing file '{file_name}': {e}")
            except Exception as e:
                print(f"An error occurred while processing file '{file_name}': {e}")

def fileWatcher(my_dir: str, pollTime: int):
    excel_file_path = os.path.join(my_dir, 'output.xlsx')
    existingFiles = []
    # Check if the output file already exists
    if os.path.isfile(excel_file_path):
        try:
            workbook = openpyxl.load_workbook(excel_file_path)
            existingFiles = jsonFilesInDirectory(my_dir)
            # Process the existing JSON files and append data to the Excel file
            doThingsWithNewFiles(existingFiles, my_dir, workbook)
            print("Existing JSON files processed and data appended to the Excel file")
        except openpyxl.utils.exceptions.InvalidFileException:
            # The existing file could not be read as a workbook; start fresh
            workbook = openpyxl.Workbook()
    else:
        workbook = openpyxl.Workbook()
    # Check for new files at startup
    newFileList = jsonFilesInDirectory(my_dir)
    fileDiff = listComparison(existingFiles, newFileList)
    existingFiles = newFileList
    if len(fileDiff) > 0:
        # Process the new files and append data to the Excel file
        doThingsWithNewFiles(fileDiff, my_dir, workbook)
        # Save the Excel file
        workbook.save(excel_file_path)
        print(f"DataFrame exported to {excel_file_path}")
    while True:
        time.sleep(pollTime)
        # Get the updated list of JSON files in the directory
        newFileList = jsonFilesInDirectory(my_dir)
        # Find the difference between the previous and new file lists
        fileDiff = listComparison(existingFiles, newFileList)
        existingFiles = newFileList
        if len(fileDiff) > 0:
            # Process the new files and append data to the Excel file
            doThingsWithNewFiles(fileDiff, my_dir, workbook)
            # Save the Excel file
            workbook.save(excel_file_path)
            print(f"DataFrame exported to {excel_file_path}")

def listComparison(originalList: list, newList: list):
    # Return the items that are in newList but not in originalList
    differencesList = [x for x in newList if x not in originalList]
    return differencesList
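# Note: if the folder grows large, a set difference may be faster than the
# list comprehension above, at the cost of the original ordering:
#     fileDiff = list(set(newList) - set(originalList))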
my_dir = r"Z:"
pollTime = 60
fileWatcher(my_dir, pollTime)
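
What I had in mind is something like the sketch below: a small sidecar file next to the output that records which JSON files have already been appended, loaded at startup and extended after each batch. This is untested, and the name processed_files.txt and the two helper functions are just placeholders I made up:

import os  # already imported above; repeated so the sketch stands alone

PROCESSED_LOG = "processed_files.txt"  # placeholder name for the sidecar file

def load_processed(my_dir: str) -> set:
    # Read back the names of the files that were already appended, if any
    log_path = os.path.join(my_dir, PROCESSED_LOG)
    if os.path.isfile(log_path):
        with open(log_path, "r", encoding="utf-8") as f:
            return {line.strip() for line in f if line.strip()}
    return set()

def mark_processed(my_dir: str, file_names: list):
    # Record the newly processed file names, one per line
    log_path = os.path.join(my_dir, PROCESSED_LOG)
    with open(log_path, "a", encoding="utf-8") as f:
        for name in file_names:
            f.write(name + "\n")

fileWatcher could then seed existingFiles from load_processed(my_dir) instead of an empty list, and call mark_processed(my_dir, fileDiff) right after each workbook.save(), so a restart would pick up where it left off. Is this a sensible approach, or is there a more standard way to handle it?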