How to apply a schema to an uploaded csv file in a Palantir Foundry dataset using the REST API

175 Views Asked by At

When I upload a CSV file into a dataset via the Foundry REST API, the dataset only contains the raw file, but I would like the actual dataset to be populated with the contents of the file. I can do this on the webpage using "Apply a schema", which has the desired effect. However, I'm looking for a programmatic solution, preferably in Python. The upload function that I have is as follows:

def upload_file_to_foundry(
    input_file,
    token,
    dataset_rid,
    base_url,
    file_path="myUploadFile.csv",
):
    """Upload a local file into a Foundry dataset as raw bytes.

    Parameters:
        input_file:  path of the local file to upload
        token:       bearer token sent in the ``authorization`` header
        dataset_rid: RID of the dataset that receives the file
        base_url:    URL prefix placed in front of ``/api/v1/datasets/...``
        file_path:   name the file gets inside the dataset; the default keeps
                     the name that used to be hard-coded in the URL

    Returns:
        The ``requests.Response`` of the upload call, so the caller can
        inspect ``.ok`` / call ``.raise_for_status()``.
    """
    headers = {
        "content-type": "application/octet-stream",
        "authorization": "Bearer {}".format(token),
    }

    # The context manager closes the file handle even if the request raises.
    with open(input_file, "rb") as payload:
        return requests.post(
            f"{base_url}/api/v1/datasets/{dataset_rid}/files:upload",
            # Let requests build (and URL-encode) the query string.
            params={"filePath": file_path},
            headers=headers,
            data=payload,
        )

This successfully loads the file into Foundry, but I need to apply a schema. Some old code that I have from a colleague uses the following:

def query_foundry(url,
                  token,
                  headers=None,
                  params=None,
                  data=None,
                  attrJson=None,
                  files=None,
                  stream=False
                  ) -> (requests.Response):
    '''
    purpose:        single POST helper shared by the foundry calls
    parameters:
        url         - endpoint the POST request is sent to
        token       - foundry bearer token; only used when no headers are given
        headers     - request headers; when None/empty, bearer auth plus a
                      json content-type are sent instead
        params      - values sent as the query string
        data        - payload sent as the request body
        attrJson    - payload sent as the request body, serialised as json
        files       - payload sent as multipart/form-data
        stream      - forwarded to requests.post (default: False)
    returns:        the requests.Response of the POST
    '''
    # fall back to the default auth/json headers only when the caller gave none
    effective_headers = headers or {
        "Authorization": f'Bearer {token}',
        "Content-Type": 'application/json',
    }

    return requests.post(
        url,
        headers=effective_headers,
        params=params,
        json=attrJson,
        data=data,
        files=files,
        stream=stream,
    )

def csv_to_foundry_dataset(token,
                           datasetRID,
                           transactionRID,
                           fileNameUpload,
                           branch='master'
                           ):
    '''
    purpose:        upload csv to foundry, commit the transaction and set
                    schema based on the foundry-schema-inference api
    parameters:
        token           - bearer token from foundry (auth token)
        datasetRID      - the rid of the dataset to overwrite
        transactionRID  - the rid of the active transaction for the dataset
        fileNameUpload  - location of the csv to upload; the file is rewritten
                          in place as utf-8 without a BOM before the upload
        branch          - the branch of the target dataset (default: master)
    returns:        the requests.Response of the final "apply schema" call
                    (not status-checked here, the caller decides)
    raises:         requests.HTTPError when the upload, the commit or the
                    schema inference request returns an error status
    '''
    headers = {"Authorization": f'Bearer {token}'}

    # strip the byte order mark from the file and encode as utf-8. this is necessary
    # when downloading a csv from foundry as it adds a BOM to the beginning of file
    with open(fileNameUpload, mode='r', encoding='utf-8-sig') as src:
        csv_text = src.read()
    with open(fileNameUpload, mode='w', encoding='utf-8') as dst:
        dst.write(csv_text)

    # upload the csv to foundry; binary mode sends the utf-8 bytes written
    # above unchanged instead of re-decoding them with the locale encoding
    url = f"{baseUrl}/foundry-data-proxy/api/dataproxy/datasets/{datasetRID}/transactions/{transactionRID}"
    with open(fileNameUpload, 'rb') as payload:
        files = {'upload': ('data.csv', payload, 'csv')}
        response = query_foundry(url=url, token=token,
                                 headers=headers, files=files)
    response.raise_for_status()

    # commit the transaction
    url = f"{baseUrl}/foundry-catalog/api/catalog/datasets/{datasetRID}/transactions/{transactionRID}/commit"
    response = query_foundry(url=url, token=token, data="{}")
    # stop here on failure instead of inferring a schema for uncommitted data
    response.raise_for_status()

    # get inferred schema from foundry
    url = f"{baseUrl}/foundry-schema-inference/api/datasets/{datasetRID}/branches/{branch}/schema"
    response = query_foundry(url=url, token=token, data="{}")
    # surface e.g. a 404 as an HTTPError rather than a json/KeyError below
    response.raise_for_status()
    schema = response.json()["data"]["foundrySchema"]

    # apply schema to foundry dataset
    url = f"{baseUrl}/foundry-metadata/api/schemas/datasets/{datasetRID}/branches/{branch}"
    return query_foundry(url=url, token=token, attrJson=schema)

But this fails with a 404 response in the following portion:

    # get inferred schema from foundry (POST to the schema-inference endpoint
    # for the given dataset and branch)
    url = f"{baseUrl}/foundry-schema-inference/api/datasets/{datasetRID}/branches/{branch}/schema"
    response = query_foundry(url=url, token=token, data="{}")
    # NOTE(review): the response status is not checked before parsing, so an
    # error response reaches json.loads / the key lookups below as-is
    c = json.loads(response.content)
    schema = c["data"]["foundrySchema"]
0

There are 0 best solutions below