I have 500 GB of data on HDFS to transfer to a Cassandra cluster. I think the fastest way is to use Cassandra's sstableloader to bulk-load SSTable files into Cassandra.
Cassandra 3.x provides the client API CQLSSTableWriter to generate SSTable files, which seems suitable for a single machine but is slow. How can I generate SSTables with MapReduce?
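For context, this is roughly what the single-machine CQLSSTableWriter approach looks like (a minimal sketch; the keyspace, table, columns and output directory are placeholders matching the table I use further below):

import java.io.File;
import org.apache.cassandra.io.sstable.CQLSSTableWriter;

public class SingleMachineWriter {
    public static void main(String[] args) throws Exception {
        String schema = "CREATE TABLE yanbo.test (id ascii PRIMARY KEY, audience_ids ascii)";
        String insert = "INSERT INTO yanbo.test (id, audience_ids) VALUES (?, ?)";

        // The output directory must already exist; the SSTable files written here
        // can afterwards be streamed into the cluster with sstableloader.
        File dir = new File("/tmp/yanbo/test");
        dir.mkdirs();

        CQLSSTableWriter writer = CQLSSTableWriter.builder()
                .inDirectory(dir)
                .forTable(schema)
                .using(insert)
                .build();

        writer.addRow("id-1", "audience-1,audience-2");  // one row per addRow() call
        writer.close();
    }
}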
HBase provides tools to generate HBase-formatted files from HDFS data with MapReduce before bulk-loading them into HBase. Does Cassandra have a similar method?
************ Added on 2017/02/21 after a lot of trying ************
With some hints from the answers, I searched on a few keywords and combined several materials into the test program below, but it seems to be missing some configuration or logic: when the reduce phase starts, it fails with "Error: org.apache.cassandra.exceptions.ConfigurationException: Expecting URI in variable: [cassandra.config]". I suspect that even if I solve this problem, new ones will follow. So it would help a lot if someone could provide a complete sample of CqlBulkOutputFormat in a Hadoop MR job, because so far I have been guessing.
I think generating SSTable files in HDFS with a Hadoop MR job should not need any information about a particular Cassandra cluster; the schema and the insert statement should be enough. What's more, reading HDFS files and generating SSTable files does not seem to need both a map and a reduce phase. That is why I doubt some of the information I found on the Internet.
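If that is right, a map-only variant might be enough: drop the reducer and let the mapper emit the bound values that CqlBulkOutputFormat takes. A rough sketch of what I mean, reusing the setup of the full program below (DirectToSstableMapper is a hypothetical name, and I am only assuming the key is ignored because my reducer also passes null):

    // In run(): no reduce phase, the mapper writes straight to the output format.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(CqlBulkOutputFormat.class);
    job.setMapperClass(DirectToSstableMapper.class);
    job.setOutputKeyClass(Object.class);
    job.setOutputValueClass(List.class);

    static class DirectToSstableMapper
            extends Mapper<LongWritable, Text, Object, List<java.nio.ByteBuffer>> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            List<java.nio.ByteBuffer> bound = new ArrayList<java.nio.ByteBuffer>();
            for (String cell : value.toString().split("\001")) {
                bound.add(java.nio.ByteBuffer.wrap(cell.getBytes()));
            }
            // Each cell binds to a "?" of the configured insert statement;
            // the key does not seem to be used by CqlBulkRecordWriter.
            context.write(null, bound);
        }
    }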
Here is my code. There may be many mistakes in your eyes; it just shows what I have tried.
package hadooptest;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlBulkOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class CassandraBulkImporter extends Configured implements Tool {

    private static final String CASSANDRA_KEYSPACE_NAME = "yanbo";
    private static final String CASSANDRA_TABLE_NAME = "test";

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CassandraBulkImporter(), args);
        System.exit(exitCode);
    }

    public int run(String[] args) throws Exception {
        // Job configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "CassandraBulkImporter");
        job.setJobName(CassandraBulkImporter.class.getName());
        job.setJarByClass(CassandraBulkImporter.class);
        job.setOutputFormatClass(CqlBulkOutputFormat.class);

        // 1.2 The class that formats the input data
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(HdfsMapper.class);
        job.setReducerClass(ReducerToCassandra.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Cassandra output settings
        ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "10.149.11.15");
        ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
        ConfigHelper.setOutputRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setOutputKeyspace(job.getConfiguration(), CASSANDRA_KEYSPACE_NAME);
        ConfigHelper.setOutputColumnFamily(
                job.getConfiguration(),
                CASSANDRA_KEYSPACE_NAME,
                CASSANDRA_TABLE_NAME);

        // Set the properties for CqlBulkOutputFormat
        String SCHEMA = String.format("CREATE TABLE %s.%s (" +
                "id ascii, " +
                "audience_ids ascii, " +
                "PRIMARY KEY (id) " +
                ")", CASSANDRA_KEYSPACE_NAME, CASSANDRA_TABLE_NAME);
        String INSERT_STMT = String.format("INSERT INTO %s.%s (" +
                "id, audience_ids" +
                ") VALUES (" +
                "?, ?" +
                ")", CASSANDRA_KEYSPACE_NAME, CASSANDRA_TABLE_NAME);

        MultipleOutputs.addNamedOutput(job,
                CASSANDRA_TABLE_NAME,
                CqlBulkOutputFormat.class, Object.class, List.class);
        CqlBulkOutputFormat.setTableSchema(
                job.getConfiguration(), CASSANDRA_TABLE_NAME, SCHEMA);
        CqlBulkOutputFormat.setTableInsertStatement(
                job.getConfiguration(), CASSANDRA_TABLE_NAME, INSERT_STMT);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    // Passes every input line through unchanged; the reducer builds the bound values.
    static class HdfsMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

        public HdfsMapper() {}

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    static class ReducerToCassandra
            extends Reducer<LongWritable, Text, Object, List<ByteBuffer>> {

        public ReducerToCassandra() {}

        private MultipleOutputs multipleOutputs;

        @Override
        @SuppressWarnings("unchecked")
        protected void setup(Context context) throws IOException, InterruptedException {
            multipleOutputs = new MultipleOutputs(context);
        }

        @Override
        public void reduce(LongWritable id, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                // Split the line on \001 and bind each cell to a "?" of the insert statement.
                List<ByteBuffer> bVariables = new ArrayList<ByteBuffer>();
                for (String cell : value.toString().split("\001")) {
                    bVariables.add(ByteBuffer.wrap(cell.getBytes()));
                }
                multipleOutputs.write(CASSANDRA_TABLE_NAME, null, bVariables);
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // MultipleOutputs must be closed, otherwise its output may never be flushed.
            multipleOutputs.close();
        }
    }
}
17/02/20 21:14:25 INFO mapreduce.Job:  map 100% reduce 0%
17/02/20 21:14:35 INFO mapreduce.Job: Task Id : attempt_1487380305027_71639_r_000000_0, Status : FAILED
Error: org.apache.cassandra.exceptions.ConfigurationException: Expecting URI in variable: [cassandra.config]. Please prefix the file with file:/// for local files or file://<server>/ for remote files. Aborting. If you are executing this from an external tool, it needs to set Config.setClientMode(true) to avoid loading configuration.
    at org.apache.cassandra.config.YamlConfigurationLoader.getStorageConfigURL(YamlConfigurationLoader.java:73)
    at org.apache.cassandra.config.YamlConfigurationLoader.loadConfig(YamlConfigurationLoader.java:85)
    at org.apache.cassandra.config.DatabaseDescriptor.loadConfig(DatabaseDescriptor.java:135)
    at org.apache.cassandra.config.DatabaseDescriptor.<clinit>(DatabaseDescriptor.java:119)
    at org.apache.cassandra.hadoop.cql3.CqlBulkRecordWriter.<init>(CqlBulkRecordWriter.java:110)
    at org.apache.cassandra.hadoop.cql3.CqlBulkRecordWriter.<init>(CqlBulkRecordWriter.java:94)
    at org.apache.cassandra.hadoop.cql3.CqlBulkOutputFormat.getRecordWriter(CqlBulkOutputFormat.java:81)
    at org.apache.cassandra.hadoop.cql3.CqlBulkOutputFormat.getRecordWriter(CqlBulkOutputFormat.java:55)
    at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.<init>(ReduceTask.java:540)
    at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:614)
    at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
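From the stack trace, CqlBulkRecordWriter's constructor touches DatabaseDescriptor, which tries to load cassandra.yaml from the cassandra.config system property inside the reduce-task JVM. My guess is that either this property has to be passed to the task JVMs as a file:// URI, or Config.setClientMode(true) has to be called as the message says. Something like the following in run() is what I would try next (the yaml path is just a placeholder, and I have not verified that this is the right fix):

    // Guess: point the reduce-task JVMs at a cassandra.yaml that exists on every
    // worker node, using the file:/// prefix the error message asks for.
    // Note this overwrites any reduce-task JVM options already set (e.g. -Xmx).
    conf.set("mapreduce.reduce.java.opts",
            "-Dcassandra.config=file:///etc/cassandra/cassandra.yaml");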