
Hadoop File Input and File Output (学步园)

This article shows how to control the way Hadoop reads input files and writes output files. Two things are accomplished:

1. Change the record format that map reads: the default <offset of the line, line content> pair becomes <file name, file content>.

2. Change the output format so that each input file produces exactly one output file, and the output file's name matches the input file's name.

Straight to the code:

coAuInputFormat

package an.hadoop.code.audit;

/**
 * The function of this class is to revise the input format:
 * the <key, value> handed to map becomes <file name, file content>.
 */
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class coAuInputFormat extends FileInputFormat<Text, Text> {

    /**
     * @brief isSplitable -- do not split files; each file must be processed as a whole.
     * (With the new mapreduce API this override takes a JobContext, not a
     * FileSystem as in the old mapred API; with the old signature the method
     * would never be called and files would still get split.)
     *
     * @param context the job context
     * @param file the file being considered for splitting
     * @return false -- each file becomes exactly one split, so even a file
     * larger than one HDFS block (64 MB by default) is not split up
     */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new coAuRecordReader(context, split);
    }
}

coAuRecordReader

package an.hadoop.code.audit;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class coAuRecordReader extends RecordReader<Text, Text> {

    private static final Log LOG = LogFactory.getLog(coAuRecordReader.class.getName());

    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private byte[] buffer;
    private String keyName;
    private FSDataInputStream fileIn;
    private Text key = null;
    private Text value = null;

    public coAuRecordReader(TaskAttemptContext context, InputSplit genericSplit) throws IOException {
        Configuration job = context.getConfiguration();
        FileSplit split = (FileSplit) genericSplit;
        start = split.getStart(); // each file is its own split, so start is the file's beginning
        end = split.getLength() + start;
        final Path path = split.getPath();
        keyName = path.toString(); // the key is the file path
        LOG.info("file name in hdfs is: " + keyName); // goes to the task log
        final FileSystem fs = path.getFileSystem(job);
        fileIn = fs.open(path);
        fileIn.seek(start);
        buffer = new byte[(int) (end - start)]; // buffer sized to hold the whole file
        this.pos = start;
    }

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // the framework calls this; it repeats the constructor's setup
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        start = split.getStart();
        end = start + split.getLength();
        final Path file = split.getPath();
        compressionCodecs = new CompressionCodecFactory(job);
        final CompressionCodec codec = compressionCodecs.getCodec(file); // looked up but unused here
        keyName = file.toString(); // the key is the file path
        LOG.info("file name in hdfs is: " + keyName); // goes to the task log
        final FileSystem fs = file.getFileSystem(job);
        fileIn = fs.open(file);
        fileIn.seek(start);
        buffer = new byte[(int) (end - start)];
        this.pos = start;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // this is the part that really matters: emit one <file path, file content> pair
        if (key == null) {
            key = new Text();
        }
        if (value == null) {
            value = new Text();
        }
        key.clear();
        key.set(keyName); // set the key
        value.clear();    // clear the value
        if (pos < end) {
            fileIn.readFully(pos, buffer); // read the entire file into the buffer
            value.set(buffer);
            pos += buffer.length;
            LOG.info("end is: " + end + " pos is: " + pos);
            return true; // the first call returns the whole file...
        }
        return false; // ...the second call reports that nothing is left
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (start == end) {
            return 0.0f;
        } else {
            return Math.min(1.0f, (pos - start) / (float) (end - start));
        }
    }

    @Override
    public void close() throws IOException {
        if (fileIn != null) {
            fileIn.close();
        }
    }
}
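What does map() actually see with this reader? Exactly one record per input file. A hypothetical pass-through mapper (mine, not the article's) makes that concrete:

package an.hadoop.code.audit;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// hypothetical mapper, not from the article: one map() call per input file
public class CoAuMapper extends Mapper<Text, Text, Text, Text> {
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        // key = full HDFS path of the file, value = the file's entire content;
        // per-file processing (e.g. the code audit) would go here
        context.write(key, value);
    }
}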

coAuOutputFormat

package an.hadoop.code.audit;

/**
 * Determines the name of each output file.
 */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;

public class coAuOutputFormat extends MultipleOutputFormat<Text, Text> {

    private final static String suffix = "_its4";

    @Override
    protected String generateFileNameForKeyValue(Text key, Text value, Configuration conf) {
        String path = key.toString();      // the key holds the input file's path and name
        String[] dir = path.split("/");
        int length = dir.length;
        String filename = dir[length - 1]; // keep only the last path component
        return filename + suffix;          // output file name = input file name + suffix
    }
}
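To make the name mapping concrete, a made-up trace (the path is mine, not the article's):

// hypothetical trace of generateFileNameForKeyValue:
// key      = "hdfs://namenode:9000/user/hadoop/in/access.log"
// dir      = ["hdfs:", "", "namenode:9000", "user", "hadoop", "in", "access.log"]
// filename = "access.log"
// result   = "access.log_its4"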

MultipleOutputFormat

package an.hadoop.code.audit;

/**
 * An output format that routes records to multiple output files,
 * one per distinct name produced by generateFileNameForKeyValue.
 */
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public abstract class MultipleOutputFormat<K extends WritableComparable<?>, V extends Writable>
        extends FileOutputFormat<K, V> { // the default would be TextOutputFormat

    private MultiRecordWriter writer = null;

    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        if (writer == null) {
            writer = new MultiRecordWriter(job, getTaskOutputPath(job)); // job, output path
        }
        return writer;
    }

    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException { // resolve the output directory
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(conf);
        if (committer instanceof FileOutputCommitter) {
            workPath = ((FileOutputCommitter) committer).getWorkPath(); // the task's working path
        } else {
            Path outputPath = super.getOutputPath(conf); // the path from the conf
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }

    /** Determine the output file name (including extension) from key, value and conf. */
    protected abstract String generateFileNameForKeyValue(K key, V value, Configuration conf); // abstract; subclasses override it

    public class MultiRecordWriter extends RecordWriter<K, V> {

        /** cache of RecordWriters, one per output file name */
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;
        private TaskAttemptContext job = null;
        /** output directory */
        private Path workPath = null;

        public MultiRecordWriter(TaskAttemptContext job, Path workPath) { // constructor
            super();
            this.job = job;
            this.workPath = workPath;
            recordWriters = new HashMap<String, RecordWriter<K, V>>();
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            // every cached writer has to be closed
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();
            while (values.hasNext()) {
                values.next().close(context);
            }
            this.recordWriters.clear();
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            // derive the output file name, reusing the writer if one already exists for it
            String baseName = generateFileNameForKeyValue(key, value, job.getConfiguration());
            RecordWriter<K, V> rw = this.recordWriters.get(baseName);
            if (rw == null) {
                rw = getBaseRecordWriter(job, baseName);
                this.recordWriters.put(baseName, rw);
            }
            rw.write(key, value);
        }

        // ${mapred.out.dir}/_temporary/_${taskid}/${nameWithExtension}
        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)
                throws IOException, InterruptedException {
            Configuration conf = job.getConfiguration();
            boolean isCompressed = getCompressOutput(job);
            String keyValueSeparator = ",";
            RecordWriter<K, V> recordWriter = null;
            if (isCompressed) {
                Class<? extends CompressionCodec> codecClass =
                        getOutputCompressorClass(job, GzipCodec.class);
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
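The source page cuts the listing off at this point, mid-method. The sketch below is my reconstruction of how getBaseRecordWriter plausibly finishes, modeled on the widely circulated version of this pattern; it is not the article's own text. The inner LineRecordWriter is an assumption patterned after Hadoop's TextOutputFormat.LineRecordWriter, which cannot be reused directly because it is not public:

                // --- hedged completion; the article's text ends above ---
                Path file = new Path(workPath, baseName + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(
                        new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
            } else {
                Path file = new Path(workPath, baseName);
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
            }
            return recordWriter;
        }
    }

    /** Minimal "key<sep>value\n" writer, patterned after TextOutputFormat.LineRecordWriter. */
    protected static class LineRecordWriter<K, V> extends RecordWriter<K, V> {
        private static final byte[] NEWLINE = "\n".getBytes();
        private final DataOutputStream out;
        private final byte[] separator;

        public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
            this.out = out;
            this.separator = keyValueSeparator.getBytes();
        }

        @Override
        public synchronized void write(K key, V value) throws IOException {
            out.write(key.toString().getBytes());   // write the key
            out.write(separator);                   // then the separator
            out.write(value.toString().getBytes()); // then the value
            out.write(NEWLINE);                     // one record per line
        }

        @Override
        public synchronized void close(TaskAttemptContext context) throws IOException {
            out.close();
        }
    }
}

Note one consequence of keyValueSeparator = ",": each output line carries the file path before the file content. A subclass that wants the bare content in the output file would write only the value.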
