0

I am creating a Data Fusion pipeline to ingest a CSV file from an S3 bucket, apply Wrangler directives, and store the result in a GCS bucket. The input CSV file has 18 columns. However, the output CSV file has only 8 columns. I suspect this could be due to the CSV encoding format, but I am not sure. What could be the reason here?

Pipeline JSON

{
"name": "aws_fusion_v1",
"description": "Data Pipeline Application",
"artifact": {
    "name": "cdap-data-pipeline",
    "version": "6.1.2",
    "scope": "SYSTEM"
},
"config": {
    "resources": {
        "memoryMB": 2048,
        "virtualCores": 1
    },
    "driverResources": {
        "memoryMB": 2048,
        "virtualCores": 1
    },
    "connections": [
        {
            "from": "Amazon S3",
            "to": "Wrangler"
        },
        {
            "from": "Wrangler",
            "to": "GCS2"
        },
        {
            "from": "Argument Setter",
            "to": "Amazon S3"
        }
    ],
    "comments": [],
    "postActions": [],
    "properties": {},
    "processTimingEnabled": true,
    "stageLoggingEnabled": true,
    "stages": [
        {
            "name": "Amazon S3",
            "plugin": {
                "name": "S3",
                "type": "batchsource",
                "label": "Amazon S3",
                "artifact": {
                    "name": "amazon-s3-plugins",
                    "version": "1.11.0",
                    "scope": "SYSTEM"
                },
                "properties": {
                    "format": "text",
                    "authenticationMethod": "Access Credentials",
                    "filenameOnly": "false",
                    "recursive": "false",
                    "ignoreNonExistingFolders": "false",
                    "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"body\",\"type\":\"string\"}]}",
                    "referenceName": "aws_source",
                    "path": "${input.bucket}",
                    "accessID": "${input.access_id}",
                    "accessKey": "${input.access_key}"
                }
            },
            "outputSchema": [
                {
                    "name": "etlSchemaBody",
                    "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"body\",\"type\":\"string\"}]}"
                }
            ],
            "type": "batchsource",
            "label": "Amazon S3",
            "icon": "icon-s3"
        },
        {
            "name": "Wrangler",
            "plugin": {
                "name": "Wrangler",
                "type": "transform",
                "label": "Wrangler",
                "artifact": {
                    "name": "wrangler-transform",
                    "version": "4.1.5",
                    "scope": "SYSTEM"
                },
                "properties": {
                    "field": "*",
                    "precondition": "false",
                    "threshold": "1",
                    "workspaceId": "804a2995-7c06-4ab2-b342-a9a01aa03a3d",
                    "schema": "${output.schema}",
                    "directives": "${directive}"
                }
            },
            "outputSchema": [
                {
                    "name": "etlSchemaBody",
                    "schema": "${output.schema}"
                }
            ],
            "inputSchema": [
                {
                    "name": "Amazon S3",
                    "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"body\",\"type\":\"string\"}]}"
                }
            ],
            "type": "transform",
            "label": "Wrangler",
            "icon": "icon-DataPreparation"
        },
        {
            "name": "GCS2",
            "plugin": {
                "name": "GCS",
                "type": "batchsink",
                "label": "GCS2",
                "artifact": {
                    "name": "google-cloud",
                    "version": "0.14.2",
                    "scope": "SYSTEM"
                },
                "properties": {
                    "project": "auto-detect",
                    "suffix": "yyyy-MM-dd-HH-mm",
                    "format": "csv",
                    "serviceFilePath": "auto-detect",
                    "location": "us",
                    "referenceName": "gcs_sink",
                    "path": "${output.path}",
                    "schema": "${output.schema}"
                }
            },
            "outputSchema": [
                {
                    "name": "etlSchemaBody",
                    "schema": "${output.schema}"
                }
            ],
            "inputSchema": [
                {
                    "name": "Wrangler",
                    "schema": ""
                }
            ],
            "type": "batchsink",
            "label": "GCS2",
            "icon": "fa-plug"
        },
        {
            "name": "Argument Setter",
            "plugin": {
                "name": "ArgumentSetter",
                "type": "action",
                "label": "Argument Setter",
                "artifact": {
                    "name": "argument-setter-plugins",
                    "version": "1.1.1",
                    "scope": "USER"
                },
                "properties": {
                    "method": "GET",
                    "connectTimeout": "60000",
                    "readTimeout": "60000",
                    "numRetries": "0",
                    "followRedirects": "true",
                    "url": "${argfile}"
                }
            },
            "outputSchema": [
                {
                    "name": "etlSchemaBody",
                    "schema": ""
                }
            ],
            "type": "action",
            "label": "Argument Setter",
            "icon": "fa-plug"
        }
    ],
    "schedule": "0 * * * *",
    "engine": "spark",
    "numOfRecordsPreview": 100,
    "description": "Data Pipeline Application",
    "maxConcurrentRuns": 1
}
}

Edit: The missing columns in the output file were due to spaces in the column names. But I am now facing another issue. In Wrangler, when I pass the directive "parse-as-csv :body ',' false", the output file is empty. But when I pass "parse-as-csv :body ',' true", the output file has all the data without the header, as expected.

0 Answers