I have an HBase table whose row key is a timestamp with a one-byte random prefix, so that keys are distributed and scans don't create a hotspot. I'm trying to extend TableInputFormat so that I can run a single MapReduce job over the table with a time range, repeated once for each of the 256 possible prefixes, so that every row within the specified time range is scanned. However, my solution doesn't work: it always scans the last prefix (127) 256 times. Something must be shared across all of the scans.
My code is below. Any ideas?
/**
 * A {@link TableInputFormat} that fans one logical scan out over all 256
 * possible one-byte row-key prefixes. For each prefix it prepends that byte
 * to the configured start/stop rows and collects the resulting splits, so a
 * single job covers the whole salted key space.
 */
public class PrefixedTableInputFormat extends TableInputFormat {
  /**
   * Builds splits for every prefix in [-128, 127].
   *
   * @param context the job context handed to the underlying input format
   * @return the concatenation of the splits produced for each prefix
   * @throws IOException if the underlying split calculation fails
   */
  @Override
  public List<InputSplit> getSplits(JobContext context)
      throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Scan scan = getScan();
    byte[] startRow = scan.getStartRow();
    byte[] stopRow = scan.getStopRow();
    for (int prefix = -128; prefix < 128; prefix++) {
      // BUG FIX: allocate FRESH arrays on every iteration. Scan (and the
      // splits derived from it) holds a reference to these arrays instead of
      // copying their contents, so reusing and mutating a single shared array
      // made every split end up with the last prefix written (127).
      byte[] prefixedStartRow = new byte[startRow.length + 1];
      byte[] prefixedStopRow = new byte[stopRow.length + 1];
      prefixedStartRow[0] = (byte) prefix;
      prefixedStopRow[0] = (byte) prefix;
      System.arraycopy(startRow, 0, prefixedStartRow, 1, startRow.length);
      System.arraycopy(stopRow, 0, prefixedStopRow, 1, stopRow.length);
      scan.setStartRow(prefixedStartRow);
      scan.setStopRow(prefixedStopRow);
      setScan(scan);
      splits.addAll(super.getSplits(context));
    }
    return splits;
  }
}
and
// Configure and wire up the aggregation job.
Configuration config = HBaseConfiguration.create();
Job job = new Job(config, "Aggregate");
job.setJarByClass(Aggregate.class);

// Base (un-prefixed) time range; PrefixedTableInputFormat prepends each of
// the 256 salt bytes to these bounds when computing splits.
Scan scan = new Scan();
// FIX: pass an explicit charset — the no-arg getBytes() uses the platform
// default encoding and is not portable across JVMs/locales.
scan.setStartRow("20120630".getBytes(java.nio.charset.StandardCharsets.UTF_8));
scan.setStopRow("20120701".getBytes(java.nio.charset.StandardCharsets.UTF_8));
scan.setCaching(500);        // larger scanner batch to cut RPC round-trips
scan.setCacheBlocks(false);  // recommended off for full-table MR scans

TableMapReduceUtil.initTableMapperJob(
    "event",                        // source table
    scan,                           // base scan, expanded per prefix
    Mapper.class,
    ImmutableBytesWritable.class,   // mapper output key
    ImmutableBytesWritable.class,   // mapper output value
    job,
    true,
    PrefixedTableInputFormat.class);
TableMapReduceUtil.initTableReducerJob("event", Reducer.class, job);
source
share