
I have a df:

    id  log
0   24  2*C316*first_field_name17*second_field_name16*third_field_name2*N311*field value1*Y5*hello2*O30*0*0*
1   25  2*C316*first_field_name17*second_field_name16*third_field_name2*N311*field value1*Y5*hello2*O30*0*0*

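For reference, splitting the first sample log on `*` gives the tokens below. The annotations are my reading of the layout, based on how the parsing function further down walks the list (each name/value token ends with the length of the next token):

log = "2*C316*first_field_name17*second_field_name16*third_field_name2*N311*field value1*Y5*hello2*O30*0*0*"
print(log.split('*'))
# ['2',                    # where the field count ends inside the next token
#  'C316',                 # field-name header: 3 fields, first name is 16 chars
#  'first_field_name17',   # name + length of the next name
#  'second_field_name16',
#  'third_field_name2',
#  'N311',                 # new-value header: first new value is 11 chars
#  'field value1', 'Y5', 'hello2',
#  'O30',                  # old-value header: first old value is 0 chars
#  '0', '0', '']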
I have a function that parses the string:

import pandas as pd

dfs = []  # collects one small DataFrame per parsed log row

def parse_audit_log(row_id, audit_log):
    split = audit_log.split('*')
    # the header token ('C316' in the sample) carries the number of fields
    # and the length of the first field name
    number_of_fields = int(split[1][1:int(split[0][0])])

    # field names: each token is the name followed by the length of the next token
    i = 2
    string_length = int(split[1][int(split[0][0]):])
    field_names_list = []
    while i < number_of_fields + 2:
        field_name = split[i][0:string_length]
        field_names_list.append(field_name)
        string_length = int(split[i][string_length:])
        i += 1

    # new values, introduced by the 'N...' header token
    i = 3 + number_of_fields
    string_length = int(split[2 + number_of_fields][string_length:])
    new_values_list = []
    while i < 3 + number_of_fields * 2:
        new_value = split[i][0:string_length]
        new_values_list.append(new_value)
        string_length = int(split[i][string_length:])
        i += 1

    # old values, introduced by the 'O...' header token; the last token has no trailing length
    i = 4 + number_of_fields * 2
    string_length = int(split[3 + number_of_fields * 2][string_length:])
    old_values_list = []
    while i <= 3 + number_of_fields * 3:
        old_value = split[i][0:string_length]
        old_values_list.append(old_value)
        if i == 3 + number_of_fields * 3:
            string_length = 0
        else:
            string_length = int(split[i][string_length:])
        i += 1

    df = pd.DataFrame(
        {'id': row_id,
         'field': field_names_list,
         'old_value': old_values_list,
         'new_value': new_values_list})

    dfs.append(df)

This function works with a normal pandas apply:

data.apply(lambda x: parse_audit_log(x['id'], x['log']), axis=1) 
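The function has no explicit return; it only appends to the global dfs list, so after the apply the per-row frames get stitched together with something like:

result = pd.concat(dfs, ignore_index=True)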

Then I attempt to apply with dask instead (data in this case is a dask dataframe read from SQL):

out = data.map_partitions(lambda df : df.apply(lambda x : parse_audit_log(x.id,x.log),axis=1), meta=('result', int)).compute(get=get)

This results in the error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-12-2468010e0124> in <module>()
----> 1 out = data.map_partitions(lambda df : df.apply(lambda x : parse_audit_log(x.ROW_ID,x.AUDIT_LOG),axis=1), meta=('result', int)).compute(get=get)

~\_installed\anaconda\lib\site-packages\dask\base.py in compute(self, **kwargs)
    153         dask.base.compute
    154         """
--> 155         (result,) = compute(self, traverse=False, **kwargs)
    156         return result
    157 

~\_installed\anaconda\lib\site-packages\dask\base.py in compute(*args, **kwargs)
    402     postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
    403                     else (None, a) for a in args]
--> 404     results = get(dsk, keys, **kwargs)
    405     results_iter = iter(results)
    406     return tuple(a if f is None else f(next(results_iter), *a)

~\_installed\anaconda\lib\site-packages\dask\multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, **kwargs)
    175                            get_id=_process_get_id, dumps=dumps, loads=loads,
    176                            pack_exception=pack_exception,
--> 177                            raise_exception=reraise, **kwargs)
    178     finally:
    179         if cleanup:

~\_installed\anaconda\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    519                         _execute_task(task, data)  # Re-execute locally
    520                     else:
--> 521                         raise_exception(exc, tb)
    522                 res, worker_id = loads(res_info)
    523                 state['cache'][key] = res

~\_installed\anaconda\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
     64     def reraise(exc, tb=None):
     65         if exc.__traceback__ is not tb:
---> 66             raise exc.with_traceback(tb)
     67         raise exc
     68 

~\_installed\anaconda\lib\site-packages\dask\local.py in execute_task()
    288     try:
    289         task, data = loads(task_info)
--> 290         result = _execute_task(task, data)
    291         id = get_id()
    292         result = dumps((result, id))

~\_installed\anaconda\lib\site-packages\dask\local.py in _execute_task()
    269         func, args = arg[0], arg[1:]
    270         args2 = [_execute_task(a, cache) for a in args]
--> 271         return func(*args2)
    272     elif not ishashable(arg):
    273         return arg

~\_installed\anaconda\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
   3402 
   3403     Ensures the output has the same columns, even if empty."""
-> 3404     df = func(*args, **kwargs)
   3405     if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)):
   3406         if len(df) == 0:

<ipython-input-12-2468010e0124> in <lambda>()
----> 1 out = data.map_partitions(lambda df : df.apply(lambda x : parse_audit_log(x.ROW_ID,x.AUDIT_LOG),axis=1), meta=('result', int)).compute(get=get)

~\_installed\anaconda\lib\site-packages\pandas\core\frame.py in apply()
   4875                         f, axis,
   4876                         reduce=reduce,
-> 4877                         ignore_failures=ignore_failures)
   4878             else:
   4879                 return self._apply_broadcast(f, axis)

~\_installed\anaconda\lib\site-packages\pandas\core\frame.py in _apply_standard()
   4971             try:
   4972                 for i, v in enumerate(series_gen):
-> 4973                     results[i] = func(v)
   4974                     keys.append(v.name)
   4975             except Exception as e:

<ipython-input-12-2468010e0124> in <lambda>()
----> 1 out = data.map_partitions(lambda df : df.apply(lambda x : parse_audit_log(x.ROW_ID,x.AUDIT_LOG),axis=1), meta=('result', int)).compute(get=get)

<ipython-input-11-08a2f8f06a76> in parse_audit_log()
      1 def parse_audit_log(row_id, audit_log):
----> 2     split = audit_log.split('*')
      3     number_of_fields = int(split[1][1:int(split[0][0])])
      4 
      5     i=2

AttributeError: ("'NoneType' object has no attribute 'split'", 'occurred at index 1')

This is one error, but I have run into several others while tweaking the function to fit dask's requirements. What am I missing in how I'm applying the function with dask? My meta argument is almost certainly not right. Although there are examples out there of applying functions with dask, I haven't seen any that involve a function without an explicit return.

Update - Desired Output:

Ideally, I will end up with a df:

   row_id  field_name         new_value    old_value
0  24      first_field_name   field value
1  24      second_field_name  Y
2  24      third_field_name   hello
3  25      first_field_name   field value
4  25      second_field_name  Y
5  25      third_field_name   hello

The problem (and the reason I'm attempting to use dask) is that the dataset has 55 million records. Since the parsing explodes each record into one or more rows, I need an approach efficient enough to stay within my 32 GB of memory.


1 Answer


The error points to an attempt to call .split on a None where a str is expected. A simple fix is to ensure the relevant column contains empty strings instead of None, for example:

df['log'] = df['log'].fillna("")
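Since data in the question is a dask dataframe, the same cleanup can be applied lazily before the map_partitions call (assuming the column is named log as in the sample):

data['log'] = data['log'].fillna("")

Dask series support fillna and column assignment, so this only touches the data when compute is called.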