hadoop - MapReduce custom TextOutputFormat - strange characters NUL, SOH, etc.
I have implemented a custom output format that converts key/value pairs to JSON format:
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class JsonOutputFormat extends TextOutputFormat<Text, IntWritable> {

    @Override
    public RecordWriter<Text, IntWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        Path path = getOutputPath(context);
        FileSystem fs = path.getFileSystem(conf);
        FSDataOutputStream out = fs.create(new Path(path, context.getJobName()));
        return new JsonRecordWriter(out);
    }

    private static class JsonRecordWriter extends LineRecordWriter<Text, IntWritable> {

        boolean firstRecord = true;

        public JsonRecordWriter(DataOutputStream out) throws IOException {
            super(out);
            out.writeBytes("{"); // open the JSON object
        }

        @Override
        public synchronized void write(Text key, IntWritable value) throws IOException {
            if (!firstRecord) {
                out.writeBytes(",\r\n");
                firstRecord = false;
            }
            out.writeUTF(key.toString() + ":" + value.toString()); // emit the record as key:value
        }

        @Override
        public synchronized void close(TaskAttemptContext context) throws IOException {
            out.writeBytes("}"); // close the JSON object
            super.close(context);
        }
    }
}
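For reference, the output I am aiming for is a single JSON object with one key:value record per line. Here is a minimal standalone sketch of that intended output (the JsonFormatSketch class name is mine, and it runs outside of Hadoop), built with writeBytes throughout:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class JsonFormatSketch {
    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(buf);
        out.writeBytes("{");        // open the object, as the constructor does
        out.writeBytes("chair:12"); // one record in key:value form
        out.writeBytes(",\r\n");    // separator between records
        out.writeBytes("book:1");
        out.writeBytes("}");        // close the object, as close() does
        // writeBytes emits the raw characters only, so this prints:
        // {chair:12,
        // book:1}
        System.out.println(buf.toString("ISO-8859-1"));
    }
}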
However, the output of the MapReduce job contains undesirable characters, such as: {NUL chair:12 NUL BS book:1}
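For reference, here is a minimal standalone sketch (the WriteUtfDemo class name is mine) that dumps the raw bytes DataOutputStream.writeUTF produces for one of these records. Per the JDK documentation, writeUTF prepends a two-byte big-endian length before the encoded string, which would match control characters like NUL and BS:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class WriteUtfDemo {
    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(buf);
        out.writeUTF("chair:12"); // writeUTF prepends a two-byte length
        for (byte b : buf.toByteArray()) {
            System.out.printf("%02x ", b);
        }
        // prints: 00 08 63 68 61 69 72 3a 31 32
        // 0x00 0x08 (NUL, BS) is the length prefix, not part of the text
    }
}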
My driver class follows:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {

    public static class MyMapper extends Mapper<Object, Text, Text, IntWritable> {

        IntWritable one = new IntWritable(1);

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String word : words)
                context.write(new Text(word), one);
        }
    }

    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            Iterator<IntWritable> it = values.iterator();
            int count = 0;
            while (it.hasNext()) {
                IntWritable c = it.next();
                count += c.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "wordcountjson");
        job.setJarByClass(Driver.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputFormatClass(JsonOutputFormat.class);
        job.setNumReduceTasks(1);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
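Assuming everything is packaged into a jar (the jar name and HDFS paths below are hypothetical), I launch the job along these lines:

hadoop jar wordcount-json.jar Driver /user/me/input /user/me/output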
Any ideas why these characters are appearing in the output?