I'm trying to write a small python script which reads a .parquet
file with the following schema:
a | b | c | d |
---|---|---|---|
0 | x | 2 | y |
2 | 1 | x | z |
The script takes the following arguments:
- one input file
- multiple columns
- multiple search strings (can be strings, numbers or regex)
It then searches the given columns for the given search-string and returns the whole line of the DataFrame which contains the given value in the given column.
My problem now is how to write the search correctly, because with the current implementation i get the following error if i try to search a column with a dtype different than utf8: RuntimeError: Any(SchemaMisMatch("Series dtype UInt64 != utf8"))
Programm execution looks like this: pyton ./pqtmgr.py -f './test.parquet' -c 'a' -s '2'
#!/usr/bin/python
# Imports
import polars
import argparse
### MAIN ###
# Main
def main():
arguments = parse_arguments()
dataframe = polars.read_parquet(arguments.files_input)
dataframe = dataframe_search(arguments, dataframe)
### MISC ###
# Search DataFrame and return a result DataFrame
def dataframe_search(arguments, dataframe) -> polars.DataFrame:
dataframes = []
for column in arguments.columns:
for search in arguments.search:
dataframes.append(
dataframe.filter(
(polars.col(column).str.contains(search))
)
)
return polars.concat(dataframes, True, "diagonal")
### ARGUMENTS ###
# Parse given arguments
def parse_arguments():
parser = argparse.ArgumentParser(
prog='pqtmgr.py'
)
# Add argument to take an input file
parser.add_argument(
'-f',
'--file-input',
dest='fils_input',
help='''
Takes one filepath as input file which will be searched
''',
required=True
)
# Add argument to take a list of columns to search
parser.add_argument(
'-c',
'--columns',
dest='columns',
help='''
Accepts one or multiple columns that will be searched
''',
nargs='*',
required=True
)
# Add argument to search the given strings
parser.add_argument(
'-s',
'--search',
dest='search',
help='''
Accepts one or more strings or regular expression that are searched for in the given columns
''',
nargs='*'
)
# Execute Main
if __name__ == '__main__':
main()