Hadoop实现共同出现的单词(Word Co-Occurrence)

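共同出现的单词(Word Co-Occurrence)是指在一个句子中相邻的两个单词,每一对相邻的单词就是一个Co-Occurrence对。本文用Hadoop MapReduce统计每个Co-Occurrence对出现的次数。从下面的示例输出可以看出:单词统一转为小写,句末和句中的标点被去掉,并且一对单词内部不分先后顺序(例如第一行中的 "c d" 和 "d c" 都计入 c:d)。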

Sample Input:

a b cc, c d d c
I Love U.
dd ee f g s sa dew ad da
So shaken as we are, so wan with care.
Find we a time for frighted peace to pant.
And breathe short-winded accents of new broil.
To be commenced in strands afar remote.
I Love U U love i.
i i i i

Sample Output:

a:b 1
a:time 1
a:we 1
accents:of 1
accents:short-winded 1
ad:da 1
ad:dew 1
afar:remote 1
afar:strands 1
and:breathe 1
are:so 1
are:we 1
as:shaken 1
as:we 1
b:cc 1
be:commenced 1
be:to 1
breathe:short-winded 1
broil:new 1
c:cc 1
c:d 2
care:with 1
commenced:in 1
d:d 1
dd:ee 1
dew:sa 1
ee:f 1
f:g 1
find:we 1
for:frighted 1
for:time 1
frighted:peace 1
g:s 1
i:i 3
i:love 3
in:strands 1
love:u 3
new:of 1
pant:to 1
peace:to 1
s:sa 1
shaken:so 1
so:wan 1
u:u 1
wan:with 1

Code:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;

public class CoOccurrence {


  public static class TextPair implements WritableComparable<TextPair> {
    // Element that sorts first (or ties) under String.compareTo when the pair is populated via set().
    private Text first;
    // Element that sorts last (or ties) under String.compareTo when the pair is populated via set().
    private Text second;
   
    public TextPair(){
     set(new Text(), new Text());
    }
    public TextPair(String left, String right) {
        set(new Text(left), new Text(right));
    }
    public TextPair(Text left, Text right) {
     set(left, right);
    }
   
    public void set(Text left, Text right){
     String l = left.toString();
     String r = right.toString();
     int cmp = l.compareTo(r);     
     if(cmp <= 0){
      this.first = left;
      this.second = right;
     }else{
      this.first = right;
      this.second = left;
     }
    }
   
    public Text getFirst() {
      return first;
    }
    public Text getSecond() {
      return second;
    }

@Override
    public void readFields(DataInput in) throws IOException {
      first.readFields(in);
      second.readFields(in);
    }
    @Override
    public void write(DataOutput out) throws IOException {
     first.write(out);
     second.write(out);
    }
    @Override
    public int hashCode() {
      return first.hashCode() * 163 + second.hashCode();//May be some trouble here. why 163? sometimes 157
    }
    @Override
    public boolean equals(Object o) {
      if (o instanceof TextPair) {
        TextPair tp = (TextPair) o;
        return first.equals(tp.first) && second.equals(tp.second);
      }
      return false;
    }
    @Override
    public String toString(){
     return first + ":" + second;
    }
    @Override
    public int compareTo(TextPair tp) {
     int cmp = first.compareTo(tp.first);
     if(cmp != 0)
      return cmp;
     return second.compareTo(tp.second);
    }

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:http://www.heiqu.com/96f9f507f1404d812256d80db80fcec8.html