
I have read about partitioning in Spring Batch and found an example which demonstrates it. The example reads persons from CSV files, does some processing, and inserts the data into a database. In this example, 1 partition = 1 file, so the partitioner implementation looks like this:

public class MultiResourcePartitioner implements Partitioner {

    private final Logger logger = LoggerFactory.getLogger(MultiResourcePartitioner.class);
    public static final String FILE_PATH = "filePath";

    private static final String PARTITION_KEY = "partition";

    private final Collection<Resource> resources;


    public MultiResourcePartitioner(Collection<Resource> resources) {
        this.resources = resources;
    }

    @Override
    public Map<String, ExecutionContext> partition(int gridSize) {
        Map<String, ExecutionContext> map = new HashMap<>(gridSize);
        int i = 0;
        for (Resource resource : resources) {
            ExecutionContext context = new ExecutionContext();
            context.putString(FILE_PATH, getPath(resource)); //Depends on what logic you want to use to split
            map.put(PARTITION_KEY + i++, context);
        }
        return map;
    }

    private String getPath(Resource resource) {
        try {
            return resource.getFile().getPath();
        } catch (IOException e) {
            logger.warn("Can't get file from from resource {}", resource);
            throw new RuntimeException(e);
        }
    }
}
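
(Note: Spring Batch actually ships a ready-made org.springframework.batch.core.partition.support.MultiResourcePartitioner that does exactly this: one partition per resource, with each resource's URL stored under the fileName key of the step execution context. The configuration further down uses that built-in class, not this hand-rolled one.)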

But what if I have a single 10TB file? Does Spring Batch allow me to partition it in some way?

update:

I tried the following approach to achieve what I want:

make 2 steps: a first step to divide the file into pieces, and a second step to process the pieces we got from the first step:

@Configuration
public class SingleFilePartitionedJob {

    @Autowired
    private JobBuilderFactory jobBuilderFactory;

    @Autowired
    private StepBuilderFactory stepBuilderFactory;

    @Autowired
    private ToLowerCasePersonProcessor toLowerCasePersonProcessor;

    @Autowired
    private DbPersonWriter dbPersonWriter;

    @Autowired
    private ResourcePatternResolver resourcePatternResolver;

    @Value("${app.file-to-split}")
    private Resource resource;


    @Bean
    public Job splitFileProcessingJob() throws IOException {
        return jobBuilderFactory.get("splitFileProcessingJob")
                .incrementer(new RunIdIncrementer())
                .flow(splitFileIntoPiecesStep())
                .next(csvToDbLowercaseMasterStep())
                .end()
                .build();
    }

    private Step splitFileIntoPiecesStep() throws IOException {
        return stepBuilderFactory.get("splitFile")
                .tasklet(new FileSplitterTasklet(resource.getFile()))
                .build();
    }

    @Bean
    public Step csvToDbLowercaseMasterStep() throws IOException {
        MultiResourcePartitioner partitioner = new MultiResourcePartitioner();
        partitioner.setResources(resourcePatternResolver.getResources("file:split/*.csv"));
        return stepBuilderFactory.get("csvReaderMasterStep")
                .partitioner("csvReaderMasterStep", partitioner)
                .gridSize(10)
                .step(csvToDataBaseSlaveStep())
                .taskExecutor(jobTaskExecutorSplitted())
                .build();
    }

    @Bean
    public Step csvToDataBaseSlaveStep() throws MalformedURLException {
        return stepBuilderFactory.get("csvToDatabaseStep")
                .<Person, Person>chunk(50)
                // null is never used here: the real value is injected from the
                // step execution context, because the reader is step-scoped
                .reader(csvPersonReaderSplitted(null))
                .processor(toLowerCasePersonProcessor)
                .writer(dbPersonWriter)
                .build();
    }

    @Bean
    @StepScope
    public FlatFileItemReader<Person> csvPersonReaderSplitted(@Value("#{stepExecutionContext[fileName]}") String fileName) throws MalformedURLException {
        return new FlatFileItemReaderBuilder<Person>()
                .name("csvPersonReaderSplitted")
                .resource(new UrlResource(fileName))
                .delimited()
                .names(new String[]{"firstName", "lastName"})
                .fieldSetMapper(new BeanWrapperFieldSetMapper<Person>() {{
                    setTargetType(Person.class);
                }})
                .build();
    }

    @Bean
    public TaskExecutor jobTaskExecutorSplitted() {
        ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
        taskExecutor.setMaxPoolSize(30);
        taskExecutor.setCorePoolSize(25);
        taskExecutor.setThreadNamePrefix("cust-job-exec2-");
        taskExecutor.afterPropertiesSet();
        return taskExecutor;
    }

}

tasklet:

public class FileSplitterTasklet implements Tasklet {
    private final Logger logger = LoggerFactory.getLogger(FileSplitterTasklet.class);
    private final File file;

    public FileSplitterTasklet(File file) {
        this.file = file;
    }

    @Override
    public RepeatStatus execute(StepContribution contribution, ChunkContext chunkContext) throws Exception {
        int count = FileSplitter.splitTextFiles(file, 100);
        logger.info("File was split into {} files", count);
        return RepeatStatus.FINISHED;
    }
}

logic for splitting file:

public static int splitTextFiles(File bigFile, int maxRows) throws IOException {
    int fileCount = 1;
    Path splitDir = Paths.get("split");
    Files.createDirectories(splitDir); // make sure the output directory exists
    try (BufferedReader reader = Files.newBufferedReader(bigFile.toPath())) {
        String line;
        int lineNum = 1;
        // write all pieces into the same split/ directory, as .csv so the
        // step's "split/*.csv" pattern can find them
        Path splitFile = splitDir.resolve(fileCount + "split.csv");
        BufferedWriter writer = Files.newBufferedWriter(splitFile, StandardOpenOption.CREATE);
        try {
            while ((line = reader.readLine()) != null) {
                if (lineNum > maxRows) {
                    writer.close();
                    lineNum = 1;
                    fileCount++;
                    splitFile = splitDir.resolve(fileCount + "split.csv");
                    writer = Files.newBufferedWriter(splitFile, StandardOpenOption.CREATE);
                }
                writer.append(line);
                writer.newLine();
                lineNum++;
            }
        } finally {
            writer.close();
        }
    }
    return fileCount;
}

So I put all the file pieces into a dedicated directory.

But this doesn't work, because at the moment of context initialization the split/ folder does not exist yet.

update:

I've come up with a workaround which works:

public class MultiResourcePartitionerWrapper implements Partitioner {
    private final MultiResourcePartitioner multiResourcePartitioner = new MultiResourcePartitioner();
    private final ResourcePatternResolver resourcePatternResolver;
    private final String pathPattern;

    public MultiResourcePartitionerWrapper(ResourcePatternResolver resourcePatternResolver, String pathPattern) {
        this.resourcePatternResolver = resourcePatternResolver;
        this.pathPattern = pathPattern;
    }

    @Override
    public Map<String, ExecutionContext> partition(int gridSize) {
        try {
            // Resolved here, at step execution time, i.e. after the splitter
            // tasklet has already populated the split/ directory.
            Resource[] resources = resourcePatternResolver.getResources(pathPattern);
            multiResourcePartitioner.setResources(resources);
            return multiResourcePartitioner.partition(gridSize);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}

But it looks ugly. Is it a correct solution?


1 Answer

Spring Batch allows you to partition, but it's up to you how to do it.

You can simply split your 10TB file in the partitioner class (into a fixed number of pieces or by max rows), and have each partition read one of the split files. You can find a lot of examples of how to split a large file in Java, e.g. split very large text file by max rows.
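
For illustration, a minimal sketch of that idea (the class name is hypothetical; it reuses the splitTextFiles(...) helper from the question). The split happens inside partition(), which runs at step execution time, so the pieces are guaranteed to exist by the time the execution-context map is built, and no eager resource resolution is needed:

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.springframework.batch.core.partition.support.Partitioner;
import org.springframework.batch.item.ExecutionContext;

public class SplittingPartitioner implements Partitioner {

    private final File bigFile;
    private final int maxRows;

    public SplittingPartitioner(File bigFile, int maxRows) {
        this.bigFile = bigFile;
        this.maxRows = maxRows;
    }

    @Override
    public Map<String, ExecutionContext> partition(int gridSize) {
        try {
            // Split first, then build one partition per piece; the files
            // exist on disk by the time this map is returned. gridSize is
            // ignored, just like in Spring's MultiResourcePartitioner.
            int count = FileSplitter.splitTextFiles(bigFile, maxRows);
            Map<String, ExecutionContext> map = new HashMap<>(count);
            for (int i = 1; i <= count; i++) {
                ExecutionContext context = new ExecutionContext();
                // same key the step-scoped reader already binds to
                context.putString("fileName",
                        new File("split", i + "split.csv").toURI().toString());
                map.put("partition" + i, context);
            }
            return map;
        } catch (IOException e) {
            throw new IllegalStateException("Failed to split " + bigFile, e);
        }
    }
}

With this, the separate splitter step and the wrapper from your last update both become unnecessary, because nothing is resolved before the step runs.

As for that wrapper: it is correct in the sense that it defers getResources() to partition() time. A less ugly equivalent (again a sketch, assuming a step-scoped proxy is only created when the master step actually runs) is to declare the built-in partitioner as a @StepScope bean:

@Bean
@StepScope
public MultiResourcePartitioner splitFilePartitioner(ResourcePatternResolver resolver) throws IOException {
    // Created lazily, so the pattern is resolved only after the splitter
    // tasklet has populated the split/ directory.
    MultiResourcePartitioner partitioner = new MultiResourcePartitioner();
    partitioner.setResources(resolver.getResources("file:split/*.csv"));
    return partitioner;
}

and reference it from the master step as .partitioner("csvReaderMasterStep", splitFilePartitioner(null)), the same late-binding trick already used for the step-scoped reader.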