3

I've been following this tutorial on how to set up and use Apache Arrow Flight.

From the example, server.py:

import pyarrow as pa
import pyarrow.flight as fl

def create_table_int():
    """Build the example table of integers.

    Returns:
        A ``pa.Table`` with two columns, ``column1`` holding 1, 2, 3 and
        ``column2`` holding 4, 5, 6. The element type is whatever
        ``pa.array`` infers for Python ints.
    """
    column_names = ['column1', 'column2']
    column_values = ([1, 2, 3], [4, 5, 6])
    arrays = [pa.array(values) for values in column_values]
    return pa.Table.from_arrays(arrays, names=column_names)


def create_table_dict():
    """Build the example table of dictionary-encoded string columns.

    Both columns are chunked arrays made of two ``pa.DictionaryArray``
    chunks each, and every chunk shares the same ``["x", "y", "z"]``
    dictionary.

    Returns:
        A ``pa.Table`` with columns ``column1`` and ``column2``.
    """
    dictionary = pa.array(["x", "y", "z"], type=pa.utf8())

    def encoded_column(*index_chunks):
        # One DictionaryArray per list of indices, all over the shared dictionary.
        return pa.chunked_array([
            pa.DictionaryArray.from_arrays(indices, dictionary)
            for indices in index_chunks
        ])

    columns = [
        encoded_column([0, 1, 2], [0, 1, 2]),
        encoded_column([1, 1, 1], [2, 2, 2]),
    ]
    return pa.Table.from_arrays(columns, names=['column1', 'column2'])

class FlightServer(fl.FlightServerBase):
    """Flight server that hands out two in-memory example tables by ticket."""

    def __init__(self, location="grpc://0.0.0.0:8815", **kwargs):
        """Initialise the base server and build the tables it serves.

        Args:
            location: Address passed to ``fl.FlightServerBase``.
            **kwargs: Forwarded unchanged to ``fl.FlightServerBase``.
        """
        super().__init__(location, **kwargs)

        # Keys are bytes literals; do_get looks them up with ticket.ticket.
        table_builders = {
            b'table_int': create_table_int,
            b'table_dict': create_table_dict,
        }
        self.tables = {name: build() for name, build in table_builders.items()}

    def do_get(self, context, ticket):
        """Return a stream over the table named by ``ticket``.

        Args:
            context: Call context supplied by the Flight framework (unused).
            ticket: Ticket whose ``ticket`` attribute is the table key.

        Returns:
            An ``fl.RecordBatchStream`` wrapping the requested table.

        Raises:
            KeyError: If no table is registered under the ticket's key.
        """
        requested = ticket.ticket
        # Alternative left disabled by the author:
        #   fl.GeneratorStream(table.schema, table.to_batches(max_chunksize=1024))
        return fl.RecordBatchStream(self.tables[requested])

def main():
    """Create a FlightServer with its default location and serve requests."""
    server = FlightServer()
    server.serve()

# Start the server only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()

client.py

import argparse
import sys

import pyarrow as pa
import pyarrow.flight as fl

def get_by_ticket(args, client):
    """Fetch the stream for ticket ``args.name`` and print it.

    Args:
        args: Parsed command-line arguments; ``args.name`` is the ticket name.
        client: Flight client used to issue the ``do_get`` call.
    """
    ticket = fl.Ticket(args.name)
    reader = client.do_get(ticket)
    # read_all() consumes the whole stream before anything is printed.
    print_response(reader.read_all())

def get_by_ticket_pandas(args, client):
    """Fetch the stream for ticket ``args.name`` via ``read_pandas`` and print it.

    Args:
        args: Parsed command-line arguments; ``args.name`` is the ticket name.
        client: Flight client used to issue the ``do_get`` call.
    """
    ticket = fl.Ticket(args.name)
    reader = client.do_get(ticket)
    # read_pandas() consumes the whole stream before anything is printed.
    print_response(reader.read_pandas())

def print_response(data):
    """Print ``data`` to stdout between a header line and a footer line.

    Args:
        data: Any object; it is rendered the way ``print`` renders it.
    """
    for line in ("=== Response ===", data, "================"):
        print(line)

def main():
    """Parse the command line, connect to the Flight server and run the action.

    Two subcommands are available, ``get_by_ticket`` and
    ``get_by_ticket_pandas``. Both take ``-n/--name``, the name of the
    ticket to fetch.

    Raises:
        SystemExit: With status 1 (after printing help) when no subcommand
            is given, or with argparse's usage-error status when ``--name``
            is missing.
    """
    parser = argparse.ArgumentParser()
    subcommands = parser.add_subparsers()

    for action in ('get_by_ticket', 'get_by_ticket_pandas'):
        subparser = subcommands.add_parser(action)
        subparser.set_defaults(action=action)
        # The name becomes the ticket payload. Without it args.name is None
        # and the request fails later with an obscure error, so argparse
        # rejects the call up front, before any connection is opened.
        subparser.add_argument('-n', '--name', type=str, required=True,
                               help="Name of the ticket to fetch.")

    args = parser.parse_args()
    if not hasattr(args, 'action'):
        # No subcommand was chosen, so there is nothing to dispatch to.
        parser.print_help()
        sys.exit(1)

    commands = {
        'get_by_ticket': get_by_ticket,
        'get_by_ticket_pandas': get_by_ticket_pandas,
    }

    client = fl.connect("grpc://0.0.0.0:8815")

    commands[args.action](args, client)


# Run the client CLI only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()

I'm running the server in a k8s cluster, accessed through a service, with various other pods making calls to it. This works fine EXCEPT when a second call is made to the server before the first call returns. In that case the first call does not get the proper response, but I don't seem to be getting any errors either. I'm not sure what the proper term is, but is there a way to make the server "blocking", so that it finishes processing the first call before it starts the second, or some other way of fixing this?

Wytrzymały Wiktor
  • 11,492
  • 5
  • 29
  • 37
ajp619
  • 670
  • 7
  • 11
  • 1
    I am also trying to understand how concurrency works for Apache Arrow Flight. Were you able to figure this out? – Rhubarb Sep 07 '21 at 22:33

0 Answers0