
I have around 50 CSV files containing around 60 million records in total to process. I don't want all of these files to be merged into a single CSV; instead, I want to merge them in small chunks of files.

Example: I want to process the first three files and merge them into a single CSV, then move on to the next three files.

Currently, I am using Spring Batch's MultiResourceItemReader to read all the files and merge them into a single file using a FlatFileItemWriter.
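
For reference, my current configuration looks roughly like this (the input/output paths, bean names, and the choice to read each CSV line as a plain String are simplified placeholders):

import org.springframework.batch.item.file.FlatFileItemReader;
import org.springframework.batch.item.file.FlatFileItemWriter;
import org.springframework.batch.item.file.MultiResourceItemReader;
import org.springframework.batch.item.file.builder.FlatFileItemReaderBuilder;
import org.springframework.batch.item.file.builder.FlatFileItemWriterBuilder;
import org.springframework.batch.item.file.mapping.PassThroughLineMapper;
import org.springframework.batch.item.file.transform.PassThroughLineAggregator;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;

    @Bean
    public MultiResourceItemReader<String> multiResourceItemReader(
            @Value("file:/data/input/*.csv") Resource[] inputFiles) {
        // delegate reader that reads one CSV file line by line as raw Strings
        FlatFileItemReader<String> delegate = new FlatFileItemReaderBuilder<String>()
                .name("csvFileReader")
                .lineMapper(new PassThroughLineMapper())
                .build();

        MultiResourceItemReader<String> reader = new MultiResourceItemReader<>();
        reader.setResources(inputFiles); // all ~50 input files
        reader.setDelegate(delegate);
        return reader;
    }

    @Bean
    public FlatFileItemWriter<String> flatFileItemWriter() {
        // writes every record into one big merged CSV file
        return new FlatFileItemWriterBuilder<String>()
                .name("mergedCsvWriter")
                .resource(new FileSystemResource("/data/output/merged.csv"))
                .lineAggregator(new PassThroughLineAggregator<>())
                .build();
    }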

PAA

1 Answer


Yes. You can create a chunk-oriented step with a chunk size of 3 where items are of type org.springframework.core.io.Resource. You can use the ResourcesItemReader to read files and a custom item writer to merge them as needed.

Here is a quick example:

import org.springframework.batch.core.Job;
import org.springframework.batch.core.JobParameters;
import org.springframework.batch.core.JobParametersBuilder;
import org.springframework.batch.core.configuration.annotation.EnableBatchProcessing;
import org.springframework.batch.core.configuration.annotation.JobBuilderFactory;
import org.springframework.batch.core.configuration.annotation.StepBuilderFactory;
import org.springframework.batch.core.configuration.annotation.StepScope;
import org.springframework.batch.core.launch.JobLauncher;
import org.springframework.batch.item.ItemReader;
import org.springframework.batch.item.ItemWriter;
import org.springframework.batch.item.file.ResourcesItemReader;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.core.io.Resource;

@Configuration
@EnableBatchProcessing
public class SO72493462 {

    @Bean
    @StepScope // step scope is needed so the jobParameters expression can be late-bound at step execution time
    public ItemReader<Resource> itemReader(@Value("#{jobParameters['inputFiles']}") Resource[] resources) {
        ResourcesItemReader resourcesItemReader = new ResourcesItemReader();
        resourcesItemReader.setResources(resources);
        return resourcesItemReader;
    }

    @Bean
    public ItemWriter<Resource> itemWriter() {
        return items -> {
            // merge resources here (the list will contain at most 3 files at a time, see chunkSize)
        };
    }

    @Bean
    public Job job(JobBuilderFactory jobs, StepBuilderFactory steps) {
        return jobs.get("job")
                .start(steps.get("step")
                        .<Resource, Resource>chunk(3)
                        .reader(itemReader(null))
                        .writer(itemWriter())
                        .build())
                .build();
    }

    public static void main(String[] args) throws Exception {
        ApplicationContext context = new AnnotationConfigApplicationContext(SO72493462.class);
        JobLauncher jobLauncher = context.getBean(JobLauncher.class);
        Job job = context.getBean(Job.class);
        JobParameters jobParameters = new JobParametersBuilder()
                .addString("inputFiles", "/data/input*.csv")
                .toJobParameters();
        jobLauncher.run(job, jobParameters);
    }

}
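
For the merge itself, one possible implementation of the itemWriter() bean above is to append the lines of each resource in the chunk to a new output file. This is only a sketch under assumptions not stated in the question: plain concatenation is acceptable, the output goes to a hypothetical /data/output directory, and the merged files are named per chunk. In addition to the imports above, it needs:

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;

    @Bean
    public ItemWriter<Resource> itemWriter() {
        AtomicInteger chunkCounter = new AtomicInteger();
        return items -> {
            // one merged file per chunk of (at most) 3 input files
            Path output = Paths.get("/data/output/merged-" + chunkCounter.incrementAndGet() + ".csv");
            try (BufferedWriter writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
                for (Resource resource : items) {
                    // note: if the CSV files have header rows, skip the first line of all but the first resource
                    try (BufferedReader reader = new BufferedReader(
                            new InputStreamReader(resource.getInputStream(), StandardCharsets.UTF_8))) {
                        String line;
                        while ((line = reader.readLine()) != null) {
                            writer.write(line);
                            writer.newLine();
                        }
                    }
                }
            }
        };
    }

Since the step is defined with chunk(3), each call to this writer receives at most three resources, so each merged output file covers three input files (the last chunk may contain fewer).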

Mahmoud Ben Hassine