0

I'm currently trying to list files/directories inside ADLS Gen2 using a continuation token (our folder currently has over 5000 files). I am able to send my first request, but I receive a 403 error (presumably meaning incorrect formatting) when I send the follow-up request with the continuation token from the response, and I am unsure what formatting problem could be causing this error.

I have tried removing the = sign at the end of the continuation token in the URI, since that was the problem for someone else. I have also tried sending the continuation token as a request header, with no luck.

adls_request is the main function. This gets run twice, once for the initial request, second for the continuation. Currently I have the continuation set up inside the uri and signature.

def gen_signature(request_time, api_version, storage_account_name, file_system_name, storage_account_key, signature_params):
    """Return the base64-encoded HMAC-SHA256 signature for a GET request.

    The string that gets signed is a newline-separated list: the verb,
    eleven standard headers (all left blank here), the canonicalized
    ``x-ms-*`` headers, and the canonicalized resource.

    request_time / api_version: values signed as ``x-ms-date`` and
        ``x-ms-version``.
    storage_account_name / file_system_name: form the ``/account/filesystem``
        prefix of the canonicalized resource.
    storage_account_key: base64-encoded key; it is decoded before being used
        as the HMAC key.
    signature_params: text appended verbatim to the canonicalized resource
        (may be an empty string).
    """
    canonicalized_headers = 'x-ms-date:' + request_time + '\nx-ms-version:' + api_version
    canonicalized_resource = '/' + storage_account_name + '/' + file_system_name + signature_params

    # One entry per line of the string-to-sign, in signing order.
    lines_to_sign = [
        'GET',  # verb
        '',     # Content-Encoding
        '',     # Content-Language
        '',     # Content-Length
        '',     # Content-MD5
        '',     # Content-Type
        '',     # Date
        '',     # If-Modified-Since
        '',     # If-Match
        '',     # If-None-Match
        '',     # If-Unmodified-Since
        '',     # Range
        canonicalized_headers,
        canonicalized_resource,
    ]
    string_to_sign = '\n'.join(lines_to_sign)

    decoded_key = base64.b64decode(storage_account_key)
    digest = hmac.new(decoded_key, msg=string_to_sign.encode('utf-8'), digestmod=hashlib.sha256).digest()
    return base64.b64encode(digest).decode()

def create_headers(request_time, api_version, storage_account_name, signed_string):
    """Return the HTTP headers dict for a signed request.

    The result carries ``x-ms-date``, ``x-ms-version`` and an
    ``Authorization`` value of the form ``SharedKey <account>:<signature>``.
    """
    authorization = 'SharedKey ' + storage_account_name + ':' + signed_string
    return {
        'x-ms-date': request_time,
        'x-ms-version': api_version,
        'Authorization': authorization,
    }

def create_url(storage_account_name, file_system_name, url_params):
    """Return the ``https://<account>.dfs.core.windows.net/<filesystem>`` URL.

    ``url_params`` is appended verbatim, so it must already include the
    leading ``?`` (or be an empty string).
    """
    pieces = ['https://', storage_account_name, '.dfs.core.windows.net/', file_system_name, url_params]
    return ''.join(pieces)

def set_optional_params(list_dir, file_dir, token_continuation):
    # Builds two parallel strings describing the same query parameters:
    #   signature_params - newline-separated "name:value" lines that the caller
    #                      appends to the canonicalized resource being signed
    #   url_params       - the "?name=value&..." query string for the URL
    # Returns (signature_params, url_params); both are '' when list_dir is falsy.
    if token_continuation != '':
        token_continuation_sig = '\ncontinuation:'+token_continuation
        # NOTE(review): [:-1] drops the token's final character and the rest of
        # the token is placed in the URL without percent-encoding, so the URL
        # and the signed string carry different token values.
        token_continuation_url = '&continuation='+token_continuation[:-1]
    else:
        # No token: contribute nothing to either string.
        token_continuation_sig = ''
        token_continuation_url = ''
    # Debug output (Python 2 print statements).
    print token_continuation_sig
    print token_continuation_url

    if list_dir:
        print type(token_continuation)
        # NOTE(review): "continuation" is placed after "directory" and
        # "recursive" in the signed string; presumably the service expects the
        # canonicalized query parameters sorted by name - confirm against the
        # Shared Key authorization docs.
        signature_params = '\ndirectory:'+file_dir+'\nrecursive:true'+token_continuation_sig+'\nresource:filesystem'
        url_params = '?directory='+file_dir+'&recursive=true'+token_continuation_url+'&resource=filesystem'
        return signature_params, url_params
    else:
        # Not a directory listing: no query parameters at all.
        signature_params = ''
        url_params = ''
        return signature_params, url_params


def get_request_time():
    """Return the current UTC time as an HTTP date string.

    The format is ``'Fri, 24 May 2019 13:32:00 GMT'``, i.e. the same layout
    as ``'%a, %d %b %Y %H:%M:%S GMT'``.

    ``email.utils.formatdate`` is used rather than ``strftime`` because
    ``%a`` and ``%b`` follow the process locale, so a non-English locale
    would produce day/month names the service cannot parse. It also avoids
    ``datetime.utcnow()``, which is deprecated in recent Python 3 releases.
    """
    # Function-scope import so the block does not depend on the file's imports.
    import email.utils

    return email.utils.formatdate(timeval=None, localtime=False, usegmt=True)


def adls_request(list_dir,
                 file_system_name,
                 file_dir='',
                 storage_account_name='account_name',
                 storage_account_key='123456789==',
                 api_version='2018-11-09',
                 token_continuation='',
                 timeout=30):
    """Send one signed GET request to the file system and return the response.

    list_dir: when truthy, the request lists ``file_dir`` recursively;
        otherwise no query parameters are added.
    file_system_name / file_dir: target file system and directory.
    storage_account_name / storage_account_key: account credentials; the
        defaults look like placeholders and are expected to be overridden.
    api_version: value sent (and signed) as ``x-ms-version``.
    token_continuation: continuation token from a previous response, or ''
        for the first page.
    timeout: seconds passed to ``requests.get``. Without a timeout a stalled
        connection would block the caller indefinitely; pass ``None`` to
        restore the unbounded wait.

    Returns the ``requests`` response object unchanged; the HTTP status is
    not checked here.
    """
    signature_params, url_params = set_optional_params(list_dir, file_dir, token_continuation)

    # The same timestamp must be used in the signature and in the header.
    request_time = get_request_time()
    signature = gen_signature(request_time, api_version, storage_account_name,
                              file_system_name, storage_account_key, signature_params)
    headers = create_headers(request_time, api_version, storage_account_name, signature)
    url = create_url(storage_account_name, file_system_name, url_params)
    return requests.get(url, headers=headers, timeout=timeout)

I expect the response to come back with status 200 and contain the rest of the files inside the directory, but I am still receiving a 403 error.

Fastas
  • 79
  • 1
  • 8

1 Answers1

0

Please try the code below; I used Python 3.7 for the test:

import requests
import datetime
import hmac
import hashlib
import base64
import urllib.parse

def gen_signature(request_time, api_version, storage_account_name, file_system_name, storage_account_key, signature_params):
    """Sign a GET request and return the signature as base64 text.

    The signed string consists of the verb, one blank line for each standard
    header that is not sent, the canonicalized ``x-ms-date`` /
    ``x-ms-version`` headers and the canonicalized resource
    (``/<account>/<filesystem>`` followed by ``signature_params``), all
    separated by newlines. It is signed with HMAC-SHA256 using the
    base64-decoded ``storage_account_key``.
    """
    # Standard headers that occupy a (blank) line in the string-to-sign.
    unset_headers = (
        'Content-Encoding',
        'Content-Language',
        'Content-Length',
        'Content-MD5',
        'Content-Type',
        'Date',
        'If-Modified-Since',
        'If-Match',
        'If-None-Match',
        'If-Unmodified-Since',
        'Range',
    )
    canonicalized_headers = '\n'.join(['x-ms-date:' + request_time, 'x-ms-version:' + api_version])
    canonicalized_resource = ''.join(['/', storage_account_name, '/', file_system_name, signature_params])

    parts = ['GET']
    parts.extend('' for _ in unset_headers)
    parts.append(canonicalized_headers)
    parts.append(canonicalized_resource)
    string_to_sign = '\n'.join(parts)

    mac = hmac.new(base64.b64decode(storage_account_key), digestmod=hashlib.sha256)
    mac.update(string_to_sign.encode('utf-8'))
    return base64.b64encode(mac.digest()).decode()

def create_headers(request_time, api_version, storage_account_name, signed_string):
    """Assemble the request headers: date, API version and authorization.

    ``Authorization`` is ``'SharedKey '`` followed by the account name, a
    colon and the signature.
    """
    headers = {}
    headers['x-ms-date'] = request_time
    headers['x-ms-version'] = api_version
    headers['Authorization'] = ''.join(['SharedKey ', storage_account_name, ':', signed_string])
    return headers

def create_url(storage_account_name, file_system_name, url_params):
    """Build the request URL for a file system on the account's dfs endpoint.

    ``url_params`` (the query string including its leading ``?``, or '') is
    appended as-is after the file system name.
    """
    endpoint = 'https://' + storage_account_name + '.dfs.core.windows.net/'
    resource = file_system_name + url_params
    return endpoint + resource

def set_optional_params(list_dir, file_dir, token_continuation):
    """Build the query parameters for the signature and for the URL.

    list_dir: when falsy, no query parameters are used and ``('', '')`` is
        returned.
    file_dir: directory to list (recursively).
    token_continuation: continuation token from the previous response; ''
        or ``None`` means "first page".

    Returns ``(signature_params, url_params)``:

    * ``signature_params`` is the newline-separated ``name:value`` list that
      is appended to the canonicalized resource. Values appear here in their
      raw (un-encoded) form, and the service expects the parameters sorted
      by name, which is why ``continuation`` comes before ``directory``.
    * ``url_params`` is the ``?name=value&...`` query string. Values are
      percent-encoded here so that characters such as ``=``, ``+``, ``&`` or
      ``#`` in the token or directory name cannot corrupt the query string.
    """
    if not list_dir:
        return '', ''

    signature_params = ''
    # '/' is left readable; it is legal inside a query value.
    url_params = '?directory=' + urllib.parse.quote(file_dir, safe='/') + '&recursive=true'

    if token_continuation:
        signature_params += '\ncontinuation:' + token_continuation
        # Tokens are base64-like (may contain '+', '/' and end with '='), so
        # they must be encoded before going into the URL.
        url_params += '&continuation=' + urllib.parse.quote_plus(token_continuation)

    signature_params += '\ndirectory:' + file_dir + '\nrecursive:true' + '\nresource:filesystem'
    url_params += '&resource=filesystem'
    return signature_params, url_params


def get_request_time():
    """Return "now" in UTC formatted like ``'Fri, 24 May 2019 13:32:00 GMT'``.

    Uses ``email.utils.formatdate`` instead of
    ``datetime.utcnow().strftime(...)`` for two reasons: the day and month
    names are always English regardless of the process locale (``%a`` /
    ``%b`` are locale-dependent), and ``datetime.utcnow()`` is deprecated
    in newer Python versions.
    """
    # Imported here so the function works without touching the import block.
    from email.utils import formatdate

    return formatdate(usegmt=True)


def adls_request(list_dir,
                 file_system_name,
                 file_dir='',
                 storage_account_name='account_name',
                 storage_account_key='123456789==',
                 api_version='2018-11-09',
                 token_continuation='',
                 timeout=30):
    """Issue one signed GET against the file system and return the response.

    list_dir: truthy to list ``file_dir`` recursively; falsy to send the
        request without query parameters.
    file_system_name / file_dir: file system and directory to query.
    storage_account_name / storage_account_key: credentials used for the
        host name and the signature; the defaults look like placeholders.
    api_version: sent and signed as ``x-ms-version``.
    token_continuation: token from the previous page's response, '' for the
        first page.
    timeout: seconds handed to ``requests.get`` so a stalled connection
        raises instead of hanging forever; ``None`` disables the limit.

    The URL is printed before the request is sent. The response object is
    returned as-is, so callers must inspect its status themselves.
    """
    signature_params, url_params = set_optional_params(list_dir, file_dir, token_continuation)

    # One timestamp feeds both the signature and the x-ms-date header.
    request_time = get_request_time()
    signature = gen_signature(request_time, api_version, storage_account_name,
                              file_system_name, storage_account_key, signature_params)
    headers = create_headers(request_time, api_version, storage_account_name, signature)
    url = create_url(storage_account_name, file_system_name, url_params)
    print(url)
    return requests.get(url, headers=headers, timeout=timeout)

if __name__ == '__main__':
    # Demo: list a directory page by page. Each response holds at most one
    # page of entries; when more remain, the response carries an
    # 'x-ms-continuation' header whose value must be sent with the next
    # request.
    list_dir = True
    file_system_name = "dd1"
    file_dir = "testfile"
    storage_account_name = 'xxx'
    storage_account_key = 'xxxx'
    api_version = '2018-11-09'
    token_continuation = ''

    print("******First Time without continuation token******")
    # The 1st request returns the first page (up to 5000 entries).
    r = adls_request(list_dir, file_system_name, file_dir, storage_account_name,
                     storage_account_key, api_version, token_continuation)
    print(r)

    # Keep requesting only while the service hands back a token. This avoids
    # re-fetching the first page when there is no token, and it also covers
    # directories that need more than two pages.
    while 'x-ms-continuation' in r.headers:
        token_continuation = r.headers["x-ms-continuation"]
        print("\n\n******Next request with continuation token******")
        print("continuation token: " + token_continuation)
        r = adls_request(list_dir, file_system_name, file_dir, storage_account_name,
                         storage_account_key, api_version, token_continuation)
        print(r)

Test result:

enter image description here

I have 6000 files in the directory. Note that a continuation token is returned only when the directory contains more than 5000 files; when you do get one, you must URL-encode the token and then add the encoded token to the URL.

This is just a simple test, and please feel free to change the code to meet your need.

Ivan Glasenberg
  • 29,865
  • 2
  • 44
  • 60
  • Awesome, encoding the token did the trick! thank you again, you've helped me out more than I can express with this. – Fastas May 24 '19 at 13:32