
When I try to download files from FTP sequentially, it works perfectly:

import ftplib
import os
import logging

class pyFTPDownload(object):
    def __init__(self,
                 remote_host=None,
                 port=None,
                 username=None,
                 passwd=None,
                 input_folder=None,
                 output_folder=None,
                 ftp_conn_id=None,
                 timeout=10
                 ):
        super(pyFTPDownload, self).__init__()
        self.remote_host = remote_host
        self.port = port
        self.username = username
        self.passwd = passwd
        self.input_folder = input_folder
        self.output_folder = output_folder
        self.ftp_conn_id = ftp_conn_id
        self.timeout = timeout
        self.client = None

    def get_conn(self):
        if not self.client:
            logging.info('creating ftp client for conn_id: {0}'.format(self.ftp_conn_id))

            if not self.username:
                raise Exception("Missing required param: username")
            if not self.passwd:
                raise Exception("Missing required param: passwd")
            if not self.remote_host:
                raise Exception("Missing required param: remote_host")
            if not self.ftp_conn_id:
                self.ftp_conn_id = str(self.username) + '@' + str(self.remote_host) + ":" + (
                    str(self.port) if self.port else "")

            try:
                client = ftplib.FTP()
                client.connect(self.remote_host, self.port if self.port else 0)  # port 0 falls back to the default FTP port (21)
                client.login(self.username, self.passwd)
                self.client = client
            except ftplib.all_errors as remote_host_error:
                logging.error("Auth failed while connecting to host: {0}, error: {1}"
                              .format(self.remote_host, remote_host_error))
            except Exception as error:
                logging.error("Error connecting to host: {0}, error: {1}"
                              .format(self.remote_host, error))
        return self.client


    def get_file(self, input_folder, output_folder=None, file_ext=None, thread_nbr=1):

        #os.chdir(output_folder)
        ftp = self.get_conn()
        #print ftp.dir(input_folder)
        logging.debug(input_folder)
        if not os.path.exists(output_folder):
            try:
                os.makedirs(output_folder, 0755)
                logging.debug("{0} is created".format(output_folder))
            except OSError as e:
                logging.error("ERROR: {0}".format(e))

        logging.info(output_folder)
        ftp.cwd(input_folder)
        for filename in ftp.nlst(file_ext):  # Loop - looking for matching files
            try:
                logging.debug("filename {0}".format(filename))
                local_file = os.path.join(output_folder, filename)
                logging.debug("local_file {0}".format(local_file))
                with open(local_file, 'wb') as fhandle:
                    logging.debug('Getting ' + filename)  # for comfort's sake, shows the file being retrieved
                    ftp.retrbinary('RETR ' + filename, fhandle.write)
                    # the with-block closes fhandle; no explicit close() needed
            except Exception as e:
                logging.error("could not download file: {0}, terminated with error {1}".format(filename, e))

But when I try to do the same thing in parallel, I get

[Errno 9] Bad file descriptor

or, when I uncomment the following two lines,

xftp.connect(self.remote_host,self.port)
xftp.login(self.username,self.passwd)

I get this error: a float is required. But there is no stack trace I can use to debug it.

My full code follows:

# coding=utf-8
from itertools import izip, repeat
import ftplib
import os
import multiprocessing
from pathos.multiprocessing import ProcessingPool as Pool
import logging
import traceback


class pyFTPDownload(object):
    def __init__(self,
                 remote_host,
                 port,
                 username,
                 passwd,
                 ftp_conn_id=None
                 ):
        super(pyFTPDownload, self).__init__()
        self.remote_host = remote_host
        self.port = port
        self.username = username
        self.passwd = passwd
        self.ftp_conn_id = ftp_conn_id
        self.client = None
        if not self.client:
            logging.info('creating ftp client for conn_id: {0}'.format(self.ftp_conn_id))

            if not self.username:
                raise Exception("Missing required param: username")
            if not self.passwd:
                raise Exception("Missing required param: passwd")
            if not self.remote_host:
                raise Exception("Missing required param: remote_host")
            if not self.ftp_conn_id:
                self.ftp_conn_id = str(self.username) + '@' + str(self.remote_host) + ":" + (
                    str(self.port) if self.port else "")

            try:
                client = ftplib.FTP()
                client.connect(self.remote_host, self.port if self.port else 0)  # port 0 falls back to the default FTP port (21)
                client.login(self.username, self.passwd)
                self.client = client
            except ftplib.all_errors as remote_host_error:
                logging.error("Auth failed while connecting to host: {0}, error: {1}"
                              .format(self.remote_host, remote_host_error))
            except Exception as error:
                logging.error("Error connecting to host: {0}, error: {1}"
                              .format(self.remote_host, error))


    def get_conn(self):
        return self.client


class loadData(pyFTPDownload):
    def __init__(self,
                 remote_host,
                 port,
                 username,
                 passwd,
                 input_folder,
                 output_folder,
                 file_ext=None,
                 nbr_processes=None,
                 ftp_conn_id = None):
        super(loadData, self).__init__(remote_host,port,username,passwd)
        self.input_folder=input_folder
        self.output_folder=output_folder
        self.file_ext=file_ext
        self.nbr_processes=nbr_processes


        if not input_folder:
            raise Exception("Missing required params: input_folder")
        if not output_folder:
            raise Exception("Missing required params: output_folder")
        if not file_ext:
            logging.warn("All the existing files in {0} will be considered".format(input_folder))
        if not nbr_processes:
            logging.warn("The number of processes to be started will be set to {0}".format(Pool.ncpus))
            self.nbr_processes=multiprocessing.cpu_count()


    def downloadfunc(self,a):
        return self.downloadf(*a)

    def downloadf(self, inputf, filename, outputf):

        global xftp
        global local_file
        global fhandle
        print filename
        try:
            xftp = self.get_conn()
            xftp.connect(self.remote_host,self.port)
            xftp.login(self.username,self.passwd)
            print xftp
        except ftplib.all_errors as remote_host_error:
            logging.error("Auth failed while connecting to host: {0}, error: {1}"
                          .format(self.remote_host, remote_host_error))
        except Exception as error:
            logging.error("Error connecting to host: {0}, error: {1}"
                          .format(self.remote_host, error))

        try:
            logging.debug("filename {0}".format(filename))
            local_file = os.path.join(outputf, filename)
            logging.debug("local_file {0}".format(local_file))
        except Exception as sd:
            logging.error("Unkmown error: {}".format(sd))
        xftp.cwd(inputf)
        try:
            with open(local_file, 'wb') as fhandle:
                logging.debug('Getting ' + filename)  # for comfort's sake, shows the file being retrieved
                xftp.retrbinary('RETR ' + filename, fhandle.write)
        except Exception as k:
            logging.error("Could not download {0} : {1}".format(local_file, k))
            logging.error(traceback.format_exc())  # log the full stack trace, not the traceback module object
        finally:
            xftp.quit()


    def get_file(self):
        print "PREPARING FILE DOWNLOAD"
        print self.output_folder
        if not os.path.exists(self.output_folder):
            try:
                logging.debug("{} does not exist".format(self.output_folder))
                os.makedirs(self.output_folder, 0755)
                logging.debug("{0} is created".format(self.output_folder))
            except OSError as e:
                logging.error("ERROR: {0} could not be created: {1}".format(self.output_folder, e))
            except Exception as d:
                logging.error(d)
        ftpObj=self.get_conn()
        ftpObj.cwd(self.input_folder)
        files_to_dl=ftpObj.nlst(self.file_ext)
        p = Pool(self.nbr_processes) 
        try:
            p.map(self.downloadfunc, izip(repeat(self.input_folder),files_to_dl,repeat(self.output_folder)))
            p.close()
            p.join()

        except Exception as f:
            logging.error(f)

I do not have a lot of experience with Python, so it would be very nice if you could check my code. I also have a question: what is the best way to implement multiprocessing in this case?

  • I have to ask, why are you doing this? The constraint on an FTP download is almost always the connection speed between client and server. If you "multiprocess", you're likely only adding context switching overhead and possibly defeating TCP's throughput maximization. Why do you expect multiplexing to produce higher throughput than sequential transfers? – James K. Lowden Mar 08 '17 at 20:34
  • @JamesK.Lowden I am trying to download more than 250,000 files each day. Do you think that this is not a use case for multiprocess downloading? – sdikby Mar 08 '17 at 20:45
  • It could be 250,000, or it could be 25. Once the pipeline is full, it's full. Why do you expect multiplexing to produce higher throughput? Have you measured the throughput and compared that to the advertised bandwidth? How many simultaneous downloads will your connection support before it's saturated? – James K. Lowden Mar 08 '17 at 20:57
  • @JamesK.Lowden we use multiprocess FTP file download but with Java, and there is a big difference when I start the tool with 1 process (sequential download) versus 10, for example. The idea is now to write the same code logic with Python, as there is nobody to maintain the Java code. FYI, our server was tested with 20 simultaneous downloads. – sdikby Mar 08 '17 at 21:11
  • @JamesK.Lowden you were right about it. After I found the root of my problem and fixed it, I ran a performance test on about 11000 files. Conclusion: 1- For pure file downloading (without on-the-fly processing), sequential download is far faster than parallel: 0,45 minute in comparison with 26 minutes for parallel downloading. 2- After the task is finished my computer seems to be very slow (seq downloading); that didn't happen with parallel DL. – sdikby Mar 09 '17 at 15:08
  • QUESTIONS: a- Can you please give an explanation for 1 and 2? b- If I try to download the same amount of files in parallel but, while downloading, do some processing on them, like running a compression/decompression algorithm on each of them, would it perform better than sequential DL? – sdikby Mar 09 '17 at 15:08
  • 1. By creating a large number of parallel threads, you're creating contention for a fixed resource: network bandwidth. Both ends attempt to meet the demand, and end up not being able to make full use of the connection. 2. To shorten end-to-end time, yes, you can process each file after it's received. For example, after file 1 is received, spawn a thread (say) to process it, and immediately begin transferring file 2. Probably I'd have two processes: 1 for ftp and 1 for subsequent processing, but there's more than one way to skin a cat. HTH. – James K. Lowden Mar 10 '17 at 00:31
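
A minimal sketch of the two-process split James K. Lowden describes in the last comment (one process downloads over a single FTP connection, a second one post-processes each file as it lands). The function process_file is a hypothetical placeholder for the compression/decompression step:

import ftplib
import multiprocessing
import os

def downloader(queue, host, port, user, passwd, inputf, outputf, names):
    # Download the given remote files one by one over a single connection
    # and hand each finished local path to the post-processing worker.
    ftp = ftplib.FTP()
    ftp.connect(host, port)
    ftp.login(user, passwd)
    ftp.cwd(inputf)
    for name in names:
        local_file = os.path.join(outputf, name)
        with open(local_file, 'wb') as fhandle:
            ftp.retrbinary('RETR ' + name, fhandle.write)
        queue.put(local_file)   # file is complete, pass it on
    queue.put(None)             # sentinel: no more files
    ftp.quit()

def worker(queue):
    # Consume finished files until the sentinel arrives.
    while True:
        local_file = queue.get()
        if local_file is None:
            break
        process_file(local_file)  # hypothetical compression/decompression step

# Usage sketch:
# q = multiprocessing.Queue()
# w = multiprocessing.Process(target=worker, args=(q,))
# w.start()
# downloader(q, host, 21, user, passwd, '/remote/dir', '/local/dir', names)
# w.join()

This way the network link stays busy with exactly one transfer at a time while the CPU-bound work happens in a separate process.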

1 Answer


I have found the problem with my code. It is in the downloadf function, exactly in this part of the code:

try:
    xftp = self.get_conn()
    xftp.connect(self.remote_host, self.port)
    xftp.login(self.username, self.passwd)
    print xftp
except ftplib.all_errors as remote_host_error:
    logging.error("Auth failed while connecting to host: {0}, error: {1}"
                  .format(self.remote_host, remote_host_error))
except Exception as error:
    logging.error("Error connecting to host: {0}, error: {1}"
                  .format(self.remote_host, error))

The problem was that I reused the same FTP connection instance through xftp = self.get_conn(), and that is why I kept getting errors that didn't seem to describe the real problem, no matter what changes I made to my code. The solution is to instantiate a new FTP connection for each process, so instead of the line above I now do: xftp = ftplib.FTP()
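
For reference, a minimal sketch of what downloadf looks like with that change, inside the loadData class and assuming the same attributes (remote_host, port, username, passwd) and the imports already at the top of the code (ftplib, os, logging):

def downloadf(self, inputf, filename, outputf):
    # Build a fresh FTP connection in this worker process instead of
    # reusing the one created in __init__ via self.get_conn().
    xftp = ftplib.FTP()
    try:
        xftp.connect(self.remote_host, self.port if self.port else 0)
        xftp.login(self.username, self.passwd)
        xftp.cwd(inputf)
        local_file = os.path.join(outputf, filename)
        with open(local_file, 'wb') as fhandle:
            xftp.retrbinary('RETR ' + filename, fhandle.write)
    except ftplib.all_errors as ftp_error:
        logging.error("Could not download {0}: {1}".format(filename, ftp_error))
    finally:
        try:
            xftp.quit()
        except ftplib.all_errors:
            pass  # the connection may never have been opened

My guess at the reason: the FTP object created in __init__ wraps a socket, and when the pool forks (or pickles self for) the worker processes, several workers end up issuing commands over the same control connection, or over a socket that is no longer valid in the child process. That shows up as confusing errors like [Errno 9] Bad file descriptor. Giving every worker its own connection avoids sharing the socket entirely.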

Maybe there is a kind soul who could give a more 'pythonic' explanation of this issue. I would be grateful.
