上面是根据给定的输入的数据进行执行的一个过程,其实结果已经输出到output-dir目录中,你可以进行查看,在目录G:\Hadoop-0.16.4\output-dir下面生成了两个文件:.part-00000.crc和part-00000。
查看处理结果,如下所示:
或者,直接到G:\hadoop-0.16.4\output-dir目录下面打开part-00000文件查看即可,内容如下所示:
actor 1
add 2
after 2
apache 1
append 1
as 6
background 1
be 2
believe 1
bench 3
block 1
cafe 2
cat 4
communications 1
connection 1
cust 1
同上面的是一样的。
这是一个非常简单的例子,而且Hadoop在其中实现了Google的MapReduce算法,用以处理数据。
我们可以简单看一下关于WordCount类的实现,在Hadoop的发行包中也附带了例子的源代码,WordCount.java类实现如下所示:
package org.apache.hadoop.examples; import java.io.IOException; import org.apache.hadoop.conf.Configuration; /** }
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner; /**
* 这是一个Hadoop Map/Reduce应用的例子。
* 读取输入的文件,实现将文件的每一行分解成单个的单词并统计单词的出现频率。
* 输出结果是被分解的单词的列表及其词频。
* 使用如下命令可以运行: bin/hadoop jar build/hadoop-examples.jar wordcount
* [-m <i>maps</i>] [-r <i>reduces</i>] <i>in-dir</i> <i>out-dir</i>
*/
public class WordCount extends Configured implements Tool {
/**
* MapClass是一个内部静态类。统计数据文件中每一行的单词。
*/
public static class MapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value,
OutputCollector<Text, IntWritable> output,
Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer itr = new StringTokenizer(line);
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
output.collect(word, one);
}
}
}
/**
* Reduce是一个内部静态类。作为统计单词数量的中间结果类,由于这个例子简单无须执行中间结果的合并。
*/
public static class Reduce extends MapReduceBase
implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values,
OutputCollector<Text, IntWritable> output,
Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
static int printUsage() { // 提示输入命令的用法
System.out.println("wordcount [-m <maps>] [-r <reduces>] <input> <output>");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
/**
* map/reduce程序的驱动部分,用于实现提交map/reduce任务。
*/
public int run(String[] args) throws Exception {
JobConf conf = new JobConf(getConf(), WordCount.class);
conf.setJobName("wordcount");
// the keys are words (strings)
conf.setOutputKeyClass(Text.class);
// the values are counts (ints)
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(MapClass.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
List<String> other_args = new ArrayList<String>();
for(int i=0; i < args.length; ++i) {
try {
if ("-m".equals(args[i])) {
conf.setNumMapTasks(Integer.parseInt(args[++i]));
} else if ("-r".equals(args[i])) {
conf.setNumReduceTasks(Integer.parseInt(args[++i]));
} else {
other_args.add(args[i]);
}
} catch (NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of " + args[i]);
return printUsage();
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from " +
args[i-1]);
return printUsage();
}
}
// Make sure there are exactly 2 parameters left.
if (other_args.size() != 2) {
System.out.println("ERROR: Wrong number of parameters: " +
other_args.size() + " instead of 2.");
return printUsage();
}
conf.setInputPath(new Path(other_args.get(0)));
conf.setOutputPath(new Path(other_args.get(1)));
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new WordCount(), args);
System.exit(res);
}
通过对着这个例子进行简单的说明,大致了解一下MapReduce算法的思想及其实现。