I was trying to create a composite PTransform as follows (Python):
class LimitVolume(beam.PTransform):
    def __init__(self, daily_window, daily_limit):
        super().__init__()
        self.daily_window = daily_window
        self.daily_limit = daily_limit

    def expand(self, input_events_pcoll):
        events_with_ts_pcol = (
            input_events_pcoll
            | 'Timestamp using RECEIVED_TIMESTAMP' >> beam.Map(
                lambda message: beam.window.TimestampedValue(
                    message, message['RECEIVED_TIMESTAMP']))
        )
        ...
        return events_with_ts_pcol
and then use it in the main run() method as follows:
def run():
    ...
    result_pcol = input_pcol | LimitVolume(daily_window, daily_limit)
Both run() and LimitVolume are defined in the same main.py script, which is then submitted/deployed as a Dataflow job in GCP.
When I run this job locally via the DirectRunner, everything works fine. If I submit and run it with the DataflowRunner in GCP, it starts throwing errors like:
in process NameError: name 'arrow' is not defined [while running 'Parse Json-ptransform-898945']
and in <lambda> NameError: name 'time' is not defined [while running 'define schedule-ptransform-899107']
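For context, the names the workers fail to resolve are modules imported at the top of main.py and referenced inside DoFns/lambdas. A hypothetical reconstruction of that pattern (ParseJson and the 'define schedule' lambda below are stand-ins, not the actual code at main.py lines 45 and 273):

import time       # standard library
import arrow      # third-party, listed in requirements.txt

import apache_beam as beam

class ParseJson(beam.DoFn):  # hypothetical stand-in for the 'Parse Json' step
    def process(self, message):
        # on the Dataflow workers this fails with: NameError: name 'arrow' is not defined
        message['RECEIVED_TIMESTAMP'] = arrow.get(message['RECEIVED_TIMESTAMP']).float_timestamp
        yield message

# hypothetical stand-in for the 'define schedule' step;
# fails with: NameError: name 'time' is not defined
define_schedule = beam.Map(lambda message: {**message, 'schedule': time.time()})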
Basically, it is not finding many of the dependencies that are all defined in the requirements.txt file and specified via the --requirements_file option when deploying the job.
See the full error stacktrace (abbreviated) below.
Now, the punch line:
If I put the same logic from the LimitVolume PTransform directly into the run() method's pipeline:
def run():
    ...
    events_with_ts_pcol = (
        input_pcol
        | 'Timestamp using RECEIVED_TIMESTAMP' >> beam.Map(
            lambda message: beam.window.TimestampedValue(
                message, message['RECEIVED_TIMESTAMP']))
    )
    ...
and remove the definition of the LimitVolume class from the main.py file, it WORKS fine both locally and in GCP, with no issues with dependencies.
So, clearly there is something very "special" about the mere presence of a custom PTransform in the pipeline. Does anybody know what that might be?
I could not find any information on custom PTransforms, the specifics of packaging them, or errors like this, which in itself is worrisome...
Thank you!!
Here is a larger excerpt of the error output:
File "apache_beam/runners/common.py", line 1232, in apache_beam.runners.common.DoFnRunner.process File "apache_beam/runners/common.py", line 571, in apache_beam.runners.common.SimpleInvoker.invoke_process File "apache_beam/runners/common.py", line 1368, in apache_beam.runners.common._OutputProcessor.process_outputs File "/Users/mpopova/Marina/TT_Projects/gcp_inboundconverter/ibc_ingest_dataflow/src/main.py", line 45, in process NameError: name 'arrow' is not defined During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 284, in _execute response = task() File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 357, in <lambda> lambda: self.create_worker().do_instruction(request), request) File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 602, in do_instruction getattr(request, request_type), request.instruction_id) File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/sdk_worker.py", line 639, in process_bundle bundle_processor.process_bundle(instruction_id)) File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/bundle_processor.py", line 997, in process_bundle element.data) File "/usr/local/lib/python3.7/site-packages/apache_beam/runners/worker/bundle_processor.py", line 222, in process_encoded self.output(decoded_value) File "apache_beam/runners/worker/operations.py", line 351, in apache_beam.runners.worker.operations.Operation.output File "apache_beam/runners/worker/operations.py", line 353, in apache_beam.runners.worker.operations.Operation.output File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive File "apache_beam/runners/worker/operations.py", line 712, in apache_beam.runners.worker.operations.DoOperation.process File "apache_beam/runners/worker/operations.py", line 713, in apache_beam.runners.worker.operations.DoOperation.process File "apache_beam/runners/common.py", line 1234, in apache_beam.runners.common.DoFnRunner.process File "apache_beam/runners/common.py", line 1315, in apache_beam.runners.common.DoFnRunner._reraise_augmented File "apache_beam/runners/common.py", line 1232, in apache_beam.runners.common.DoFnRunner.process File "apache_beam/runners/common.py", line 571, in apache_beam.runners.common.SimpleInvoker.invoke_process File "apache_beam/runners/common.py", line 1368, in apache_beam.runners.common._OutputProcessor.process_outputs File "/Users/mpopova/Marina/TT_Projects/gcp_inboundconverter/ibc_ingest_dataflow/src/main.py", line 45, in process NameError: name 'arrow' is not defined [while running 'Parse Json-ptransform-898945'] passed through: ==> dist_proc/dax/workflow/worker/fnapi_service_impl.cc:771
...
  line 1234, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1299, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "apache_beam/runners/common.py", line 1232, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 571, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "apache_beam/runners/common.py", line 1395, in apache_beam.runners.common._OutputProcessor.process_outputs
  File "apache_beam/runners/worker/operations.py", line 215, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
  File "apache_beam/runners/worker/operations.py", line 712, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/worker/operations.py", line 713, in apache_beam.runners.worker.operations.DoOperation.process
  File "apache_beam/runners/common.py", line 1234, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 1315, in apache_beam.runners.common.DoFnRunner._reraise_augmented
  File "apache_beam/runners/common.py", line 1232, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 572, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "/Users/mpopova/Marina/TT_Projects/gcp_inboundconverter/ibc_ingest_dataflow/venv/lib/python3.7/site-packages/apache_beam/transforms/core.py", line 1562, in <lambda>
    wrapper = lambda x: [fn(x)]
  File "/Users/mpopova/Marina/TT_Projects/gcp_inboundconverter/ibc_ingest_dataflow/src/main.py", line 273, in <lambda>
NameError: name 'time' is not defined [while running 'define schedule-ptransform-899107']

passed through:
==>
    dist_proc/dax/workflow/worker/fnapi_service_impl.cc:771