import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * Hadoop MapReduce Sample Driver.
 * <p/>
 * <pre>
 * #hadoop jar JAR_FILE CLASS -libjars LIB_JAR COMMAND_OPTS
 * </pre>
 *
 * @author Data Dynamics
 * @version 0.1
 */
public class UnionDriver extends org.apache.hadoop.conf.Configured implements org.apache.hadoop.util.Tool {

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new UnionDriver(), args);
        System.exit(res);
    }

    public int run(String[] args) throws Exception {
        // Use the Configuration populated by ToolRunner so that generic options
        // such as -libjars are actually applied to the job.
        Job job = Job.getInstance(getConf());
        parseArguments(args, job);

        job.setJarByClass(UnionDriver.class);

        // Mapper Class
        job.setMapperClass(UnionMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Map-only job: no Reducer tasks
        job.setNumReduceTasks(0);

        // Run the Hadoop job
        return job.waitForCompletion(true) ? 0 : 1;
    }

    private void parseArguments(String[] args, Job job) throws IOException {
        for (int i = 0; i < args.length; ++i) {
            if ("-input".equals(args[i])) {
                FileInputFormat.addInputPaths(job, args[++i]);
            } else if ("-output".equals(args[i])) {
                FileOutputFormat.setOutputPath(job, new Path(args[++i]));
            }
        }
    }
}
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper that combines the small files under a given HDFS path into larger files.
 * When the files are combined, the unit of the larger output files is the HDFS block size.
 *
 * @author Data Dynamics
 * @version 0.1
 */
public class UnionMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // If each row of the input file is written back out as-is, the map-only job
        // re-emits the data split into files of roughly HDFS block size.
        context.write(NullWritable.get(), value);
    }
}
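For reference, a minimal sketch of how the driver might be invoked, following the command pattern in the driver's Javadoc. The jar name, library jar, and HDFS paths below are placeholders, not part of the original project; note that the generic option -libjars must come before the tool-specific -input/-output options handled by parseArguments.

hadoop jar union-example.jar UnionDriver -libjars some-lib.jar \
    -input /user/hadoop/small-files -output /user/hadoop/union-out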