09 使用python完成詞頻統計

import sys

# input comes from stdin (standard input)

def map_words(lines):
    """Yield a ``(word, 1)`` pair for every whitespace-separated word.

    ``lines`` is any iterable of text lines.  Leading/trailing whitespace
    and blank lines are ignored because ``str.split()`` with no argument
    splits on runs of whitespace and drops empty fields.
    """
    for line in lines:
        for word in line.split():
            # The trivial per-occurrence count is 1; the reduce step is
            # responsible for summing these up.
            yield word, 1


def run_mapper(lines=None):
    """Read lines (``sys.stdin`` by default) and print ``word<TAB>1`` records.

    What is written to stdout here is the input of the reduce step.
    """
    if lines is None:
        lines = sys.stdin
    for word, count in map_words(lines):
        # Tab-delimited key/value record.
        print('%s\t%s' % (word, count))


if __name__ == '__main__':
    run_mapper()





from operator import itemgetter

import sys

def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    ``lines`` is an iterable of ``word<TAB>count`` text lines.  Yields one
    ``(word, total)`` tuple per run of identical words, in input order.

    Grouping only looks at adjacent records, so it is correct only when the
    input is already sorted by word (Hadoop sorts the map output by key
    before handing it to the reducer).

    Lines that have no tab separator, or whose count is not an integer, are
    silently discarded.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # Remove leading and trailing whitespace (including the newline).
        line = line.strip()
        try:
            word, count = line.split('\t', 1)
            # The count arrives as a string; convert it to int.
            count = int(count)
        except ValueError:
            # Missing separator or non-numeric count: ignore this line.
            continue

        if current_word == word:
            current_count += count
        else:
            # A new word starts; emit the finished run, if there was one.
            if current_word is not None:
                yield current_word, current_count
            current_count = count
            current_word = word

    # Do not forget the last run.  Testing against None (rather than against
    # the most recently parsed word) avoids printing "None\t0" on empty
    # input and avoids losing the last word when a malformed line follows it.
    if current_word is not None:
        yield current_word, current_count


def run_reducer(lines=None):
    """Read mapper output (``sys.stdin`` by default) and print the totals.

    Each result is written to stdout as a ``word<TAB>total`` record.
    """
    if lines is None:
        lines = sys.stdin
    for word, total in reduce_counts(lines):
        print('%s\t%s' % (word, total))


if __name__ == '__main__':
    run_reducer()




aa bb cc dd aa cc

aa bb cc dd aa cc

aa bb cc dd aa cc

aa bb cc dd aa cc

aa bb cc dd aa cc cc dd

hdfs dfs -mkdir /data

hdfs dfs -put info.txt /data/info

# Submit the word-count job through Hadoop Streaming.
# NOTE(review): the streaming jar path and the mapper options were missing
# from these notes. The jar location below is the usual Hadoop 2.x/3.x layout
# and mapper.py is assumed to live in /root next to reducer.py -- confirm both.
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar \
    -input "/data/*" \
    -output "/out99" \
    -mapper "python mapper.py" \
    -reducer "python reducer.py" \
    -file "/root/mapper.py" \
    -file "/root/reducer.py"




