0

I am creating in memory PCollection and writing it into postgres sql. now, when I insert data into table, few records may throw exception and will not be inserted. how to extract such failed insert records when I start pipeline?

below is the code I have written for pipeline:

PipelineOptions options = PipelineOptionsFactory.create();
     
    options.setRunner(FlinkRunner.class);
    Pipeline p = Pipeline.create(options);

    // Preparing dummy data
    Collection<Stock> stockList = Arrays.asList(new Stock("AAP", 2000,"Apple Inc"),
            new Stock("MSF", 3000, "Microsoft Corporation"),
            new Stock("NVDA", 4000, "NVIDIA Corporation"),
            new Stock("INT", 3200, "Intel Corporation"));
    
    // Reading dummy data and save it into PCollection<Stock>
    PCollection<Stock> data = p.apply(Create.of(stockList)
                               .withCoder(SerializableCoder.of(Stock.class)));
    //insert
    @SuppressWarnings("unused")
    PDone insertData =  data.apply(JdbcIO.<Stock>write()
                                
                    .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration
                            .create("org.postgresql.Driver","jdbc:postgresql://localhost:5432/postgres")
                            .withUsername("postgres").withPassword("sachin"))           
                    .withStatement("insert into stocks values(?, ?, ?)")
                    .withPreparedStatementSetter(new JdbcIO.PreparedStatementSetter<Stock>() {
                        private static final long serialVersionUID = 1L;
                        public void setParameters(Stock element, PreparedStatement query) throws SQLException {
                            query.setString(1, element.getSymbol());
                            query.setLong(2, element.getPrice());
                            query.setString(3, element.getCompany());
                        }
        
                    }));
    
    
        p.run().waitUntilFinish();
    
  • My recommendation to create the error handling would be to create another table and store all the errors that you receive from the different values in there, so every error that is created would have a space in a table and you can review them without a problem. – Eduardo Ortiz Nov 23 '21 at 20:52
  • @EduardoOrtiz how to do error handling with jdbcio. it does not provide any way to get error records while insert happening – Sachin Rane Nov 24 '21 at 03:26

1 Answers1

1

After going through all apache beam programming guide, i did not get any clue, So, copied JdbcIO and modified execute batch where I have separated inserted successful record and insert failed record by using TupleTags. now, It is working.

below is code for modified JdbcIO:

  private static class WriteFn<T> extends DoFn<T, T> {
  private static final int DEFAULT_BATCH_SIZE = 1;

  private final Write<T> spec;

  private DataSource dataSource;
  private Connection connection;
  private PreparedStatement preparedStatement;
  **private TupleTag<T> validTupleTag;
  private TupleTag<T> inValidTupleTag;**
  private int batchCount;

  public WriteFn(Write<T> spec) {
    this.spec = spec;
  }

  @Setup
  public void setup() throws Exception {
    dataSource = spec.getDataSourceConfiguration().buildDatasource();
    connection = dataSource.getConnection();
    connection.setAutoCommit(false);
    preparedStatement = connection.prepareStatement(spec.getStatement());
    validTupleTag = spec.getValidTupleTag();
    inValidTupleTag = spec.getInvalidTupleTag();
  }

  @StartBundle
  public void startBundle() {
    batchCount = 0;
  }
  
  @ProcessElement
  public void processElement(@Element T record, MultiOutputReceiver out) 
  throws Exception {
    preparedStatement.clearParameters();
    spec.getPreparedStatementSetter().setParameters(record, 
   preparedStatement);
    preparedStatement.addBatch();

    batchCount++;

    if (batchCount >= DEFAULT_BATCH_SIZE) {
        if (batchCount > 0) {
            try {
            preparedStatement.executeBatch();
            connection.commit();
            **out.get(validTupleTag).output(record);**
        } catch (SQLException e1) {
            //TODO add logger
            **out.get(inValidTupleTag).output(record);**
        }
            batchCount = 0;
          }
    }
  }

and client code:

import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import org.apache.beam.runners.flink.FlinkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult.State;
import org.apache.beam.sdk.coders.SerializableCoder;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;

/**
 * @author sachin
 * @date 18-Nov-2021
*/

public class BeamTest {
static List<Stock> stocks = new ArrayList<>();

public static void main(String[] args) {
    System.setProperty("java.specification.version", "1.8");
    process();
    // read();

}

public static void process() {
    final TupleTag<Stock> VALID = new TupleTag<Stock>() {
    };
    final TupleTag<Stock> INVALID = new TupleTag<Stock>() {
    };

    PipelineOptions options = PipelineOptionsFactory.create();

    options.setRunner(FlinkRunner.class);
    Pipeline p = Pipeline.create(options);

    // Preparing dummy data
    Collection<Stock> stockList = Arrays.asList(
            new Stock("AAP", 2000, "Apple Inc"),
            new Stock("MSF", 3000, "Microsoft Corporation"), 
            new Stock("NVDA", 4000, "NVIDIA Corporation"),
            new Stock("INT", 3200, "Intel Corporation"));

    // Reading dummy data and save it into PCollection<Stock>
    PCollection<Stock> data = 




  p.apply(Create.of(stockList).
  withCoder(SerializableCoder.of(Stock.class)));
    // insert
    PCollectionTuple pCollectionTupleResult = data.apply("write", 
 CustomJdbcIOWrite.<Stock>write()

                     
.withDataSourceConfiguration(CustomJdbcIOWrite.DataSourceConfiguration
                    .create("org.postgresql.Driver", 
 "jdbc:postgresql://localhost:5432/postgres")
                    .withUsername("postgres").withPassword("sachin"))
                    .withStatement("insert into stocks values(?, ?, 
 ?)").withValidTag(VALID).withInValidTag(INVALID)
                    .withPreparedStatementSetter(new 
 CustomJdbcIOWrite.PreparedStatementSetter<Stock>() {
                        private static final long serialVersionUID = 1L;
    
                        public void setParameters(Stock element, 
 PreparedStatement query) throws SQLException {
                            query.setString(1, element.getSymbol());
                            query.setLong(2, element.getPrice());
                            query.setString(3, element.getCompany());
                        }

                    }));
    // get failed PCollection using INVALID tupletag
    PCollection<Stock> failedPcollection = 
    pCollectionTupleResult.get(INVALID)
            .setCoder(SerializableCoder.of(Stock.class));
        
    failedPcollection.apply(ParDo.of(new DoFn<Stock, Stock>() {

        private static final long serialVersionUID = 1L;

        @ProcessElement
        public void process(ProcessContext pc) {
            System.out.println("Failed pCollection element:" + 
        pc.element().getCompany());
        }

    }));

    //get failed PCollection using INVALID tupletag
    PCollection<Stock> insertedPcollection = 
    pCollectionTupleResult.get(VALID)
            .setCoder(SerializableCoder.of(Stock.class));
    insertedPcollection.apply(ParDo.of(new DoFn<Stock, Stock>() {

        private static final long serialVersionUID = 1L;
        
        @ProcessElement
        public void process(ProcessContext pc) {
            System.out.println("Inserted pCollection element:" + 
        pc.element().getCompany());
        }
        
    }));
    
    // run pipeline
    State state = p.run().waitUntilFinish();
    System.out.println("Data inserted successfully with state : " + 
    state);

   }

 }

below is the output as new Stock("NVDA", 4000, "NVIDIA Corporation") is intentianlly not inserted as my db column accept only 3 char "NVD" and not 4 chars "NVDA":

Inserted pCollection element:Microsoft Corporation
Failed pCollection element:NVIDIA Corporation
Inserted pCollection element:Intel Corporation
Inserted pCollection element:Apple Inc
Data inserted successfully with state : DONE

Full Details and github link