
While ingesting JSON data from Kafka and saving it as Parquet files to be loaded into Hive, I ran into the same issue described in Flink BucketingSink with Custom AvroParquetWriter create empty file. Does anyone know how to resolve it? Thank you. I am using Apache Flink 1.4.0 + HDFS 2.7.3.

Casel Chen

1 Answer


You can directly implement the BucketingSink's Writer interface (org.apache.flink.streaming.connectors.fs.Writer). It could look like this:

import org.apache.flink.streaming.connectors.fs.Writer;
import org.apache.flink.util.Preconditions;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

import java.io.IOException;

/**
 * Parquet writer for Flink's BucketingSink.
 *
 * @param <T> the record type, which must be an Avro {@link GenericRecord}
 */
public class ParquetSinkWriter<T extends GenericRecord> implements Writer<T> {

    private static final long serialVersionUID = -975302556515811398L;

    private final CompressionCodecName compressionCodecName = CompressionCodecName.SNAPPY;
    private final int pageSize = 64 * 1024;

    private final String schemaRepresentation;

    private transient Schema schema;
    private transient ParquetWriter<GenericRecord> writer;
    private transient Path path;

    private int position;

    public ParquetSinkWriter(String schemaRepresentation) {
        this.schemaRepresentation = Preconditions.checkNotNull(schemaRepresentation);
    }

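    // Called by the BucketingSink each time a new part file is opened.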
    @Override
    public void open(FileSystem fs, Path path) throws IOException {
        this.position = 0;
        this.path = path;

        if (writer != null) {
            writer.close();
        }

        writer = createWriter();
    }

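    // ParquetWriter has no flush operation, so flushing here means closing the
    // current writer (forcing its data onto disk) and re-creating it for the same path.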
    @Override
    public long flush() throws IOException {
        Preconditions.checkNotNull(writer);
        position += writer.getDataSize();
        writer.close();
        writer = createWriter();

        return position;
    }

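    // The position is approximated as the bytes flushed so far plus the data
    // currently held by the open writer.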
    @Override
    public long getPos() throws IOException {
        Preconditions.checkNotNull(writer);
        return position + writer.getDataSize();
    }

    @Override
    public void close() throws IOException {
        if (writer != null) {
            writer.close();
            writer = null;
        }
    }

    @Override
    public void write(T element) throws IOException {
        Preconditions.checkNotNull(writer);
        writer.write(element);
    }

    @Override
    public Writer<T> duplicate() {
        return new ParquetSinkWriter<>(schemaRepresentation);
    }

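    // Lazily parses the schema (kept as a string so the writer stays serializable)
    // and builds an AvroParquetWriter for the current part file path.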
    private ParquetWriter<GenericRecord> createWriter() throws IOException {
        if (schema == null) {
            schema = new Schema.Parser().parse(schemaRepresentation);
        }

        return AvroParquetWriter.<GenericRecord>builder(path)
            .withSchema(schema)
            .withDataModel(new GenericData())
            .withCompressionCodec(compressionCodecName)
            .withPageSize(pageSize)
            .build();
    }
}
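For reference, wiring this writer into a BucketingSink could look roughly like the sketch below; the base path, the bucket time format, and schemaString are placeholders to adapt to your own job:

import org.apache.avro.generic.GenericRecord;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;

public class ParquetSinkExample {

    // Attaches the ParquetSinkWriter to a BucketingSink. 'records' is assumed to be
    // a DataStream of GenericRecord produced upstream (e.g. by the Kafka/JSON mapper),
    // and 'schemaString' is the Avro schema JSON passed to the writer.
    public static void addParquetSink(DataStream<GenericRecord> records,
                                      String basePath,
                                      String schemaString) {
        BucketingSink<GenericRecord> sink = new BucketingSink<>(basePath);
        sink.setBucketer(new DateTimeBucketer<GenericRecord>("yyyy-MM-dd--HH-mm"));
        sink.setWriter(new ParquetSinkWriter<GenericRecord>(schemaString));
        sink.setBatchSize(128 * 1024 * 1024); // roll part files at roughly 128 MB
        records.addSink(sink);
    }
}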
Till Rohrmann
  • Thanks for your suggestion. But I still met the following exception. Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: /wikipedia-edits-flink/partitionKey=2018-01-10--12-10/_part-5-0.in-progress for client 127.0.0.1 already exists ... at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInternal(FSNamesystem.java:2563) ... at flink.ParquetSinkWriter.createWriter(ParquetSinkWriter.java:98) at flink.ParquetSinkWriter.flush(ParquetSinkWriter.java:58) at org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink.snapshotState(BucketingSink.java:688) – Casel Chen Jan 10 '18 at 04:11
  • Can it write Parquet files without an explicit Avro schema, letting Apache Flink infer it from the DataFrame[T] type instead? – Casel Chen Jan 11 '18 at 10:22
  • I think you can get the `Schema` from the `AvroSerializer` which you can instantiate via the `AvroTypeInfo`. – Till Rohrmann Jan 11 '18 at 14:23
  • @till reading this implementation and the BucketingSink implementation, is it correct to say that a hard failure will cause a complete loss of all the data currently in the ParquetWriter cache? I guess so, since not flushing the data and not storing it in any state makes it impossible for the job to restore it... – enrico Feb 17 '20 at 09:46
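Following up on deriving the Schema instead of writing the JSON by hand: below is a minimal sketch using plain Avro utilities (rather than going through Flink's AvroSerializer); pojoClass is a placeholder for whatever class describes your records.

import org.apache.avro.Schema;
import org.apache.avro.reflect.ReflectData;

public class SchemaUtil {

    // Derives the Avro schema JSON for a plain Java class via Avro reflection.
    // For an Avro-generated SpecificRecord class, MyRecord.getClassSchema() already
    // returns the Schema directly.
    public static String schemaFor(Class<?> pojoClass) {
        Schema schema = ReflectData.get().getSchema(pojoClass);
        return schema.toString();
    }
}

The resulting string can then be handed to new ParquetSinkWriter<>(...) as the schemaRepresentation.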