
I need help with what seems like a very common task: we have huge hourly logfiles containing many different events. We have been using Hive to split these events into different files, in a hard-coded way:

from events
  insert overwrite table specificevent1
    select * where events.event_type='specificevent1'
  insert overwrite table specificevent2
    select * where events.event_type='specificevent2'
...;

This is problematic as the code must change for each new event that we add.

We tried to use dynamic partitioning to do the splitting automatically, but ran into problems:

  1. If my partition schema is /year/month/day/hour/event, then we cannot recover partitions for more than a day's worth of data, since the number of partitions for a month would be roughly (30 days) × (24 hours) × (~100 events) ≈ 72,000, which is far too many to work with.
  2. If my schema is event/year/month/day/hour, then because the event is the dynamic part, all the partition columns after it must be dynamic as well, and the splitting takes more and more time as the number of partitions grows (a sketch of such a query follows this list).
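
For illustration, this is roughly the kind of dynamic-partition query meant in point 2. It is a minimal sketch only: the events_split table, the payload column, and the year/month/day/hour columns are made up for the example and are not our real schema.

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;   -- needed because the leading partition column (event) is dynamic

from events
  insert overwrite table events_split partition (event, year, month, day, hour)
    select payload, event_type, year, month, day, hour;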

Is there a better way to do this (Hive and non-Hive solutions)?


1 Answer


Hope this will help others...

I found that Hive is not the way to go if you want to split a logfile into many different files (one file per event_type). The dynamic partitions offered by Hive have too many limitations, IMHO.

What I ended up doing is writing a custom MapReduce jar. I also found the old Hadoop API more suitable, as it offers the MultipleTextOutputFormat abstract class, which lets you implement generateFileNameForKeyValue(). (The new Hadoop API offers a different multiple-output mechanism, MultipleOutputs, which is great if you have predefined output locations; I did not figure out how to generate them on the fly from the key/value.)

example code:

/*
Run example:
hadoop jar DynamicSplit.jar DynamicEventSplit.DynamicEventSplitMultifileMapReduce /event/US/incoming/2013-01-01-01/ event US 2013-01-01-01 2 "[a-zA-Z0-9_ ]+" "/event/dynamicsplit1/" ","
*/
package DynamicEventSplit;

import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.mapred.lib.*;

public class DynamicEventSplitMultifileMapReduce
{
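        //Mapper: extracts the event name from the configured column, validates it against the regexp, and emits (event name, full log line)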
        static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>  
        {
            private String event_name;
            private String EventNameRegexp;
            private int EventNameColumnNumber;
            private String columndelimeter=",";

            public void configure(JobConf job)
            {
                EventNameRegexp=job.get("EventNameRegexp");
                EventNameColumnNumber=Integer.parseInt(job.get("EventNameColumnNumber"));
                columndelimeter=job.get("columndelimeter");
            }
            public void map(LongWritable key, Text value,OutputCollector<Text, Text> output, Reporter reporter) throws IOException 
            {
                //check that expected event_name field exists  
                String [] dall=value.toString().split(columndelimeter);
                if (dall.length<EventNameColumnNumber)
                {
                    return;
                }
                event_name=dall[EventNameColumnNumber-1];
                //check that expected event_name is valid  
                if (!event_name.matches(EventNameRegexp))
                {
                    return;
                }
                //emit the event name as the key so records are grouped and routed by event
                output.collect(new Text(event_name),value);
            }
        }

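        //Identity reducer: passes every record through so MultiFileOutput can route it by event name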
        static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> 
        {
            public void reduce(Text key, Iterator<Text> values,OutputCollector<Text, Text> output, Reporter reporter) throws IOException 
            {
                    while (values.hasNext()) 
                    {
                        output.collect(key, values.next());
                    }
            }
        }


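        //Output format that writes each record to a Hive-style partition directory derived from its key (the event name)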
        static class MultiFileOutput extends MultipleTextOutputFormat<Text, Text> 
        {
            private String event_name;
            private String site;
            private String event_date;
            private String year;
            private String month;
            private String day;
            private String hour;
            private String basepath;


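            //read per-job settings (site, date, base path) when a writer is requested, before any record is written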
            public RecordWriter<Text,Text> getRecordWriter(FileSystem fs, JobConf job,String name, Progressable arg3) throws IOException
            {
                RecordWriter<Text,Text> rw=super.getRecordWriter(fs, job, name, arg3);
                site=job.get("site");
                event_date=job.get("date");
                year=event_date.substring(0,4);
                month=event_date.substring(5,7);
                day=event_date.substring(8,10);
                hour=event_date.substring(11,13);
                basepath=job.get("basepath");
                return rw;
            }

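            //route each record to a per-event directory; the path mirrors Hive partition naming (event=/site=/year=/...)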
            protected String generateFileNameForKeyValue(Text key, Text value,String leaf) 
            {
                event_name=key.toString();
                return basepath+"event="+event_name+"/site="+site+"/year="+year+"/month="+month+"/day="+day+"/hour="+hour+"/"+leaf;
            }

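            //returning null drops the key, so only the original log line is written to the output file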
            protected Text generateActualKey(Text key, Text value) 
            {
                return null;
            }
        }

        public static void main(String[] args) throws Exception 
        {
                String InputFiles=args[0];
                String OutputDir=args[1];
                String SiteStr=args[2];
                String DateStr=args[3];
                String EventNameColumnNumber=args[4];
                String EventNameRegexp=args[5];
                String basepath=args[6];
                String columndelimeter=args[7];

                Configuration mycon=new Configuration();
                JobConf conf = new JobConf(mycon,DynamicEventSplitMultifileMapReduce.class);
                conf.set("site",SiteStr);
                conf.set("date",DateStr);

                conf.setOutputKeyClass(Text.class);
                conf.setMapOutputKeyClass(Text.class);
                conf.setOutputValueClass(Text.class);

                conf.setMapperClass(Map.class);
                conf.setReducerClass(Reduce.class);

                conf.setInputFormat(TextInputFormat.class);
                conf.setOutputFormat(MultiFileOutput.class);

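                //speculative execution is disabled here, presumably to avoid duplicate task attempts writing to the same custom output paths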
                conf.setMapSpeculativeExecution(false);
                conf.setReduceSpeculativeExecution(false);

                FileInputFormat.setInputPaths(conf,InputFiles);
                FileOutputFormat.setOutputPath(conf,new Path("/"+OutputDir+SiteStr+DateStr+"/"));

                conf.set("EventNameColumnNumber",EventNameColumnNumber);
                conf.set("EventNameRegexp",EventNameRegexp);
                conf.set("basepath",basepath);
                conf.set("columndelimeter",columndelimeter);

                JobClient.runJob(conf);
        }
}
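
One note on the output layout: the paths built in generateFileNameForKeyValue() follow Hive's partition-directory convention (event=.../site=.../year=.../...), so if you still want to query the split data with Hive you should be able to attach each directory to an external table. A hypothetical example (the events_split table, its partition values, and the location are assumptions based on the example run above, not something the job itself creates):

alter table events_split add if not exists
  partition (event='specificevent1', site='US', year='2013', month='01', day='01', hour='01')
  location '/event/dynamicsplit1/event=specificevent1/site=US/year=2013/month=01/day=01/hour=01';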
harelg