I want to find duplicates (or near-duplicates) among the names in a `people`
DataFrame using the Levenshtein distance: two names that are at most 1 edit apart are considered duplicates. This means computing the Levenshtein distance between every pair of names in the DataFrame, which has almost 300,000 rows.
The input `people` looks like:
| name |
|---------------------|
| jack |
| john |
| jack |
| jackz |
The output should look like:
| name | duplicates |
|---------------------|------------------|
| jack | [0,2,3] |
| john | [1] |
| jack | [0,2,3] |
| jackz | [0,2,3] |
That is why I'm using the cuDF library on a GPU. Using the functions from this library sped the computation up a lot, but it is still too slow to run on the entire dataframe.
My code is the following:
import pandas as pd
import cudf
people = pd.read_csv('people.csv')
# Keep this frame in pandas: the row function below runs on the host and only
# hands the string comparison to the GPU. cudf's own DataFrame.apply compiles
# the function with Numba, which cannot type `list` or cudf objects.
df1 = people

# Upload the names to the GPU once, instead of once per row. Going through
# list() drops the pandas index, so the indexes reported in "duplicates" are
# expected to be row positions (0..n-1).
targets = cudf.Series(list(df1['name']))


def filter_rows(row: pd.Series) -> pd.Series:
    """Add a "duplicates" entry to ``row`` and return it.

    "duplicates" is the list of indexes in the module-level ``targets``
    Series whose name is at most 1 Levenshtein edit away from
    ``row['name']``. The row's own position is part of that list, since a
    name is at distance 0 from itself.

    Args:
        row: one row of the people frame; must contain a 'name' entry.

    Returns:
        The same ``row`` object, with "duplicates" set.
    """
    # Create the entry up front; it is overwritten with the real result below.
    row["duplicates"] = []
    # edit_distance is part of the cudf library and calculates the Levenshtein
    # distance. Given a single string, every name in `targets` is measured
    # against it, so no n-long repeated Series has to be built per row.
    dist = targets.str.edit_distance(row['name'])
    # Keep only the near matches; where() nulls the rest and dropna() removes them.
    dist = dist.where(dist <= 1).dropna()
    row["duplicates"] = dist.index.to_arrow().to_pylist()
    return row


# Apply the function (this will create a new column called "duplicates",
# holding the list of indexes whose names are at most 1 Levenshtein edit away
# from the row's name)
people_with_dup = df1.apply(filter_rows, axis=1)
To accelerate the code, I tried to convert the `people` dataframe into a cuDF dataframe by adding this right after the line that reads the CSV:
# Convert the pandas frame into a cuDF (GPU) frame; the module is imported as `cudf`.
people = cudf.from_pandas(people)
But when running the code I get the following error:
---------------------------------------------------------------------------
TypingError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/cudf/core/indexed_frame.py:2136, in IndexedFrame._apply(self, func, kernel_getter, *args, **kwargs)
2135 try:
-> 2136 kernel, retty = _compile_or_get(
2137 self, func, args, kernel_getter=kernel_getter
2138 )
2139 except Exception as e:
File /opt/conda/lib/python3.10/site-packages/nvtx/nvtx.py:101, in annotate.__call__.<locals>.inner(*args, **kwargs)
100 libnvtx_push_range(self.attributes, self.domain.handle)
--> 101 result = func(*args, **kwargs)
102 libnvtx_pop_range(self.domain.handle)
File /opt/conda/lib/python3.10/site-packages/cudf/core/udf/utils.py:298, in _compile_or_get(frame, func, args, kernel_getter)
295 # precompile the user udf to get the right return type.
296 # could be a MaskedType or a scalar type.
--> 298 kernel, scalar_return_type = kernel_getter(frame, func, args)
299 np_return_type = (
300 numpy_support.as_dtype(scalar_return_type)
301 if scalar_return_type.is_internal
302 else scalar_return_type.np_dtype
303 )
File /opt/conda/lib/python3.10/site-packages/cudf/core/udf/row_function.py:146, in _get_row_kernel(frame, func, args)
143 row_type = _get_frame_row_type(
144 np.dtype(list(_all_dtypes_from_frame(frame).items()))
145 )
--> 146 scalar_return_type = _get_udf_return_type(row_type, func, args)
147 # this is the signature for the final full kernel compilation
File /opt/conda/lib/python3.10/site-packages/nvtx/nvtx.py:101, in annotate.__call__.<locals>.inner(*args, **kwargs)
100 libnvtx_push_range(self.attributes, self.domain.handle)
--> 101 result = func(*args, **kwargs)
102 libnvtx_pop_range(self.domain.handle)
File /opt/conda/lib/python3.10/site-packages/cudf/core/udf/utils.py:144, in _get_udf_return_type(argty, func, args)
142 # Get the return type. The PTX is also returned by compile_udf, but is not
143 # needed here.
--> 144 ptx, output_type = cudautils.compile_udf(func, compile_sig)
146 if not isinstance(output_type, MaskedType):
File /opt/conda/lib/python3.10/site-packages/cudf/utils/cudautils.py:250, in compile_udf(udf, type_signature)
248 # We haven't compiled a function like this before, so need to fall back to
249 # compilation with Numba
--> 250 ptx_code, return_type = cuda.compile_ptx_for_current_device(
251 udf, type_signature, device=True
252 )
253 if not isinstance(return_type, cudf.core.udf.masked_typing.MaskedType):
File /opt/conda/lib/python3.10/site-packages/numba/cuda/compiler.py:293, in compile_ptx_for_current_device(pyfunc, args, debug, lineinfo, device, fastmath, opt)
292 cc = get_current_device().compute_capability
--> 293 return compile_ptx(pyfunc, args, debug=debug, lineinfo=lineinfo,
294 device=device, fastmath=fastmath, cc=cc, opt=True)
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler_lock.py:35, in _CompilerLock.__call__.<locals>._acquire_compile_lock(*args, **kwargs)
34 with self:
---> 35 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/numba/cuda/compiler.py:269, in compile_ptx(pyfunc, args, debug, lineinfo, device, fastmath, cc, opt)
262 nvvm_options = {
263 'debug': debug,
264 'lineinfo': lineinfo,
265 'fastmath': fastmath,
266 'opt': 3 if opt else 0
267 }
--> 269 cres = compile_cuda(pyfunc, None, args, debug=debug, lineinfo=lineinfo,
270 fastmath=fastmath,
271 nvvm_options=nvvm_options)
272 resty = cres.signature.return_type
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler_lock.py:35, in _CompilerLock.__call__.<locals>._acquire_compile_lock(*args, **kwargs)
34 with self:
---> 35 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/numba/cuda/compiler.py:212, in compile_cuda(pyfunc, return_type, args, debug, lineinfo, inline, fastmath, nvvm_options)
211 with target_override('cuda'):
--> 212 cres = compiler.compile_extra(typingctx=typingctx,
213 targetctx=targetctx,
214 func=pyfunc,
215 args=args,
216 return_type=return_type,
217 flags=flags,
218 locals={},
219 pipeline_class=CUDACompiler)
221 library = cres.library
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler.py:716, in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
714 pipeline = pipeline_class(typingctx, targetctx, library,
715 args, return_type, flags, locals)
--> 716 return pipeline.compile_extra(func)
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler.py:452, in CompilerBase.compile_extra(self, func)
451 self.state.lifted_from = None
--> 452 return self._compile_bytecode()
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler.py:520, in CompilerBase._compile_bytecode(self)
519 assert self.state.func_ir is None
--> 520 return self._compile_core()
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler.py:499, in CompilerBase._compile_core(self)
498 if is_final_pipeline:
--> 499 raise e
500 else:
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler.py:486, in CompilerBase._compile_core(self)
485 try:
--> 486 pm.run(self.state)
487 if self.state.cr is not None:
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler_machinery.py:368, in PassManager.run(self, state)
367 patched_exception = self._patch_error(msg, e)
--> 368 raise patched_exception
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler_machinery.py:356, in PassManager.run(self, state)
355 if isinstance(pass_inst, CompilerPass):
--> 356 self._runPass(idx, pass_inst, state)
357 else:
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler_lock.py:35, in _CompilerLock.__call__.<locals>._acquire_compile_lock(*args, **kwargs)
34 with self:
---> 35 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler_machinery.py:311, in PassManager._runPass(self, index, pss, internal_state)
310 with SimpleTimer() as pass_time:
--> 311 mutated |= check(pss.run_pass, internal_state)
312 with SimpleTimer() as finalize_time:
File /opt/conda/lib/python3.10/site-packages/numba/core/compiler_machinery.py:273, in PassManager._runPass.<locals>.check(func, compiler_state)
272 def check(func, compiler_state):
--> 273 mangled = func(compiler_state)
274 if mangled not in (True, False):
File /opt/conda/lib/python3.10/site-packages/numba/core/typed_passes.py:105, in BaseTypeInference.run_pass(self, state)
102 with fallback_context(state, 'Function "%s" failed type inference'
103 % (state.func_id.func_name,)):
104 # Type inference
--> 105 typemap, return_type, calltypes, errs = type_inference_stage(
106 state.typingctx,
107 state.targetctx,
108 state.func_ir,
109 state.args,
110 state.return_type,
111 state.locals,
112 raise_errors=self._raise_errors)
113 state.typemap = typemap
File /opt/conda/lib/python3.10/site-packages/numba/core/typed_passes.py:81, in type_inference_stage(typingctx, targetctx, interp, args, return_type, locals, raise_errors)
79 infer.seed_type(k, v)
---> 81 infer.build_constraint()
82 # return errors in case of partial typing
File /opt/conda/lib/python3.10/site-packages/numba/core/typeinfer.py:1039, in TypeInferer.build_constraint(self)
1038 for inst in blk.body:
-> 1039 self.constrain_statement(inst)
File /opt/conda/lib/python3.10/site-packages/numba/core/typeinfer.py:1386, in TypeInferer.constrain_statement(self, inst)
1385 if isinstance(inst, ir.Assign):
-> 1386 self.typeof_assign(inst)
1387 elif isinstance(inst, ir.SetItem):
File /opt/conda/lib/python3.10/site-packages/numba/core/typeinfer.py:1459, in TypeInferer.typeof_assign(self, inst)
1458 elif isinstance(value, (ir.Global, ir.FreeVar)):
-> 1459 self.typeof_global(inst, inst.target, value)
1460 elif isinstance(value, ir.Arg):
File /opt/conda/lib/python3.10/site-packages/numba/core/typeinfer.py:1559, in TypeInferer.typeof_global(self, inst, target, gvar)
1558 try:
-> 1559 typ = self.resolve_value_type(inst, gvar.value)
1560 except TypingError as e:
File /opt/conda/lib/python3.10/site-packages/numba/core/typeinfer.py:1480, in TypeInferer.resolve_value_type(self, inst, val)
1479 msg = str(e)
-> 1480 raise TypingError(msg, loc=inst.loc)
TypingError: Failed in cuda mode pipeline (step: nopython frontend)
Untyped global name 'list': Cannot determine Numba type of <class 'type'>
File "../../../../tmp/ipykernel_6251/2531843191.py", line 10:
<source missing, REPL/exec in use?>
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
File <timed exec>:1
File /opt/conda/lib/python3.10/site-packages/nvtx/nvtx.py:101, in annotate.__call__.<locals>.inner(*args, **kwargs)
98 @wraps(func)
99 def inner(*args, **kwargs):
100 libnvtx_push_range(self.attributes, self.domain.handle)
--> 101 result = func(*args, **kwargs)
102 libnvtx_pop_range(self.domain.handle)
103 return result
File /opt/conda/lib/python3.10/site-packages/cudf/core/dataframe.py:4383, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
4380 if result_type is not None:
4381 raise ValueError("The `result_type` kwarg is not yet supported.")
-> 4383 return self._apply(func, _get_row_kernel, *args, **kwargs)
File /opt/conda/lib/python3.10/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
76 @wraps(func)
77 def inner(*args, **kwds):
78 with self._recreate_cm():
---> 79 return func(*args, **kwds)
File /opt/conda/lib/python3.10/site-packages/nvtx/nvtx.py:101, in annotate.__call__.<locals>.inner(*args, **kwargs)
98 @wraps(func)
99 def inner(*args, **kwargs):
100 libnvtx_push_range(self.attributes, self.domain.handle)
--> 101 result = func(*args, **kwargs)
102 libnvtx_pop_range(self.domain.handle)
103 return result
File /opt/conda/lib/python3.10/site-packages/cudf/core/indexed_frame.py:2140, in IndexedFrame._apply(self, func, kernel_getter, *args, **kwargs)
2136 kernel, retty = _compile_or_get(
2137 self, func, args, kernel_getter=kernel_getter
2138 )
2139 except Exception as e:
-> 2140 raise ValueError(
2141 "user defined function compilation failed."
2142 ) from e
2144 # Mask and data column preallocated
2145 ans_col = _return_arr_from_dtype(retty, len(self))
ValueError: user defined function compilation failed.
Do you have any idea why this happens, or how I can speed the computation up further? Thanks.