The table columns appear to have fixed widths.
Without having more of the table, it is hard to reliably determine if a row is meaningful or not. I've dropped rows if the value in the first column doesn't appear to be a number like 3.500000+6 by looking for .
at the exact position.
The numbers in the first 6 columns seem to be in scientific notation but without the e
. Code fixing that is included at the bottom.
import pandas as pd
filename = r"C:\Users\Bobson Dugnutt\Desktop\table.txt"
column_widths = (11, 11, 11, 11, 11, 11, 4, 2, 3, 5)
df = pd.read_fwf(filename, widths=column_widths, header=None)
print(df, end="\n\n")
# Returns True if it looks like a number like "3.500000+6"
def is_meaningful_row(value):
#print(value)
if isinstance(value, str):
if value[-9] == ".":
return True
return False
# Remove rows if the first column doesn't look like a number like "3.500000+6"
df = df[df[0].map(is_meaningful_row)]
# Reset the index so the first row is no longer index 5
df = df.reset_index(drop=True)
print(df, end="\n\n")
# Replace NA/NaN values with empty strings
df = df.fillna("")
print(df, end="\n\n")
# Transforms a string like "-2.039133-1" to "-2.039133e-1"
def fix_scientific_notation(value):
if isinstance(value, str):
return value[0:1] + value[1:].replace("+", "e+").replace("-", "e-")
return value
# For the first 6 columns, fixes the strange scientific notation and converts
# values from string to float64 or int64
for column in df.columns[:6]:
df[column] = df[column].map(fix_scientific_notation)
df[column] = pd.to_numeric(df[column])
print(df)
Output:
0 1 2 3 4 5 6 7 8 9
0 References NaN NaN NaN NaN NaN 425 1 451 259
1 /1/J.H.Gibb ons,R.L.Mac klin:Phys.R ev.B137,150 8(1965) NaN 425 1 451 260
2 /2/P.R.Wrea n,C.R.Brune ,R.W.Kavana gh:Phys.Rev .C49,1205(1 994) 425 1 451 261
3 /3/L.Van de r Zwan,K.W. Geiger:Nucl .Phys.A152, 481(1970) NaN 425 1 451 262
4 /4/T.Murata :JAERI-Conf 98-003,p.2 15 NaN NaN 425 1 451 263
5 3.500000+6 3.844649-2 3.508375+6 3.783472-2 4.000000+6 -2.064883-2 425 3 2 7
6 4.014688+6 -2.289045-2 4.403160+6 -8.623264-2 4.500000+6 -1.022847-1 425 3 2 8
7 1.450000+7 -2.039133-1 1.500000+7 -1.930321-1 NaN NaN 425 3 2 17
0 1 2 3 4 5 6 7 8 9
0 3.500000+6 3.844649-2 3.508375+6 3.783472-2 4.000000+6 -2.064883-2 425 3 2 7
1 4.014688+6 -2.289045-2 4.403160+6 -8.623264-2 4.500000+6 -1.022847-1 425 3 2 8
2 1.450000+7 -2.039133-1 1.500000+7 -1.930321-1 NaN NaN 425 3 2 17
0 1 2 3 4 5 6 7 8 9
0 3.500000+6 3.844649-2 3.508375+6 3.783472-2 4.000000+6 -2.064883-2 425 3 2 7
1 4.014688+6 -2.289045-2 4.403160+6 -8.623264-2 4.500000+6 -1.022847-1 425 3 2 8
2 1.450000+7 -2.039133-1 1.500000+7 -1.930321-1 425 3 2 17
0 1 2 3 4 5 6 7 8 9
0 3500000.0 0.038446 3508375.0 0.037835 4000000.0 -0.020649 425 3 2 7
1 4014688.0 -0.022890 4403160.0 -0.086233 4500000.0 -0.102285 425 3 2 8
2 14500000.0 -0.203913 15000000.0 -0.193032 NaN NaN 425 3 2 17