# Copyright 2009-2010 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An implementation of wc as an MRJob.

This is meant as an example of why mapper_final is useful."""
from mrjob.job import MRJob


class MRWordCountUtility(MRJob):
    """Count characters, words and lines of the input, like ``wc``.

    Running totals are kept on the job instance while lines are mapped and
    are only emitted once, from :meth:`mapper_final`.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``MRJob`` and zero the three counters."""
        super(MRWordCountUtility, self).__init__(*args, **kwargs)
        self.chars = self.words = self.lines = 0

    def mapper(self, _, line):
        """Fold one input line into the running totals.

        Nothing is yielded per line; the sums are emitted by
        ``mapper_final`` once all lines have been processed, and the
        reducer then collects those results.
        """
        # str.split() with no separator never produces empty tokens, so the
        # length of the result is the number of words on the line.
        word_count = len(line.split())

        self.lines += 1
        self.words += word_count
        self.chars += len(line) + 1  # +1 for newline

    def mapper_final(self):
        """Emit the accumulated totals as (name, count) pairs."""
        totals = (
            ('chars', self.chars),
            ('words', self.words),
            ('lines', self.lines),
        )
        for name, count in totals:
            yield name, count

    def reducer(self, key, values):
        """Add up all counts received for ``key`` and emit the total."""
        total = sum(values)
        yield key, total


if __name__ == '__main__':
    MRWordCountUtility.run()