
Learning the Hive Source Code

李建奇

1 Learning

Having read part of the code, my impression is that Hive is fairly complex and its use cases are limited; in most situations Hadoop's native MapReduce is good enough.


1.2 Version

0.6

1.3 Purpose

Learn from the experience of Facebook and others in applying Hive, so that it can be applied at our company.

The point of studying the code is to apply Hive better: debugging, tuning, and applying new patches, for example.

2 Pig + Hive:

ETL + data warehouse

The data preparation phase is often known as ETL (Extract Transform Load) or the data factory. "Factory" is a good analogy because it captures the essence of what is being done in this stage:

Just as a physical factory brings in raw materials and outputs products ready for consumers, so a data factory brings in raw data and produces data sets ready for data users to consume. Raw data is loaded in, cleaned up, conformed to the selected data model, joined with other data sources, and so on. Users in this phase are generally engineers, data specialists, or researchers.

The data presentation phase is usually referred to as the data warehouse. A warehouse stores products ready for consumers; they need only come and select the proper products off of the shelves. In this phase, users may be engineers using the data for their systems, analysts, or decision makers.

Given the different workloads and different users for each phase, we have found that different tools work best in each phase. Pig (combined with a workflow system such as Oozie) is best suited for the data factory, and Hive for the data warehouse.

 

2.1 Data warehouse

Data warehouse use cases

In the data warehouse phase of processing, we see two dominant use cases: business-intelligence analysis and ad-hoc queries.

In the first case, users connect the data to business intelligence (BI) tools, such as MicroStrategy, to generate reports or do further analysis.

In the second case, users run ad-hoc queries issued by data analysts or decision makers.

In both cases, the relational model and SQL are the best fit. Indeed, data warehousing has been one of the core use cases for SQL through much of its history. It has the right constructs to support the types of queries and tools that analysts want to use. And it is already in use by both the tools and users in the field.

2.2 Facebook's deployment architecture

3 Hive

3.1 Architecture

3.2 Query Translation

SELECT url, count(*) FROM page_views GROUP BY url
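To make the comparison with hand-written MapReduce (section 1) concrete: this query is essentially a word count keyed on url. Below is a minimal sketch of the same job against the old org.apache.hadoop.mapred API of this era; the input layout (url as the first tab-separated column) is an assumption for illustration.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

// A hand-written equivalent of "SELECT url, count(*) FROM page_views GROUP BY url".
// Assumes the input is text files where the url is the first tab-separated field.
public class UrlCount {
  public static class Map extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, LongWritable> {
    private static final LongWritable ONE = new LongWritable(1);
    public void map(LongWritable key, Text value,
        OutputCollector<Text, LongWritable> out, Reporter reporter)
        throws IOException {
      String url = value.toString().split("\t")[0];
      out.collect(new Text(url), ONE);
    }
  }

  public static class Reduce extends MapReduceBase
      implements Reducer<Text, LongWritable, Text, LongWritable> {
    public void reduce(Text key, Iterator<LongWritable> values,
        OutputCollector<Text, LongWritable> out, Reporter reporter)
        throws IOException {
      long sum = 0;
      while (values.hasNext()) {
        sum += values.next().get();
      }
      out.collect(key, new LongWritable(sum));
    }
  }

  public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf(UrlCount.class);
    conf.setJobName("urlcount");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    JobClient.runJob(conf);
  }
}

Hive compiles the one-line query into roughly this: a map-side TableScan/Select/GroupBy/ReduceSink operator chain playing the role of the mapper, and a reduce-side GroupBy/FileSink chain playing the role of the reducer.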

3.3 SerDe
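A SerDe tells Hive how to turn the bytes of a stored row into a row object and back. The contract lives in the serde2 package; a sketch of its shape follows (the exact signatures are from memory of this era and should be treated as an assumption):

// A sketch of the org.apache.hadoop.hive.serde2 contract; signatures assumed.
public interface Deserializer {
  void initialize(Configuration conf, Properties tbl) throws SerDeException;
  // turn a Writable read from the table's InputFormat into a row object
  Object deserialize(Writable blob) throws SerDeException;
  // describes the structure of the row object returned by deserialize
  ObjectInspector getObjectInspector() throws SerDeException;
}

public interface Serializer {
  void initialize(Configuration conf, Properties tbl) throws SerDeException;
  Class<? extends Writable> getSerializedClass();
  // turn a row object back into a Writable for the table's OutputFormat
  Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException;
}

// a SerDe is simply both halves together
public interface SerDe extends Deserializer, Serializer {
}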

3.4 Table storage structure

4 QL

I start the analysis from the plan, because I think the plan is the core of this system.

That is: SQL -> plan -> execute.

4.1 Plan

4.1.1 From the unit-test perspective

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;

import junit.framework.TestCase;

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory;
import org.apache.hadoop.hive.ql.plan.*;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class TestPlan extends TestCase {

  final String F1 = "#affiliations";
  final String F2 = "friends[0].friendid";

  public void testPlan() throws Exception {
    try {
      // initialize a complete map-reduce configuration
      ExprNodeDesc expr1 = new ExprNodeColumnDesc(
          TypeInfoFactory.stringTypeInfo, F1, "", false);
      ExprNodeDesc expr2 = new ExprNodeColumnDesc(
          TypeInfoFactory.stringTypeInfo, F2, "", false);
      ExprNodeDesc filterExpr = TypeCheckProcFactory.DefaultExprProcessor
          .getFuncExprNodeDesc("==", expr1, expr2);

      FilterDesc filterCtx = new FilterDesc(filterExpr, false);

      // an operator of type filter
      Operator<FilterDesc> op = OperatorFactory.get(FilterDesc.class);
      op.setConf(filterCtx);

      // define a pathToAliases mapping
      ArrayList<String> aliasList = new ArrayList<String>();
      aliasList.add("a");
      LinkedHashMap<String, ArrayList<String>> pa =
          new LinkedHashMap<String, ArrayList<String>>();
      pa.put("/tmp/testfolder", aliasList);

      // define a pathToPartitionInfo mapping
      TableDesc tblDesc = Utilities.defaultTd;
      PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
      LinkedHashMap<String, PartitionDesc> pt =
          new LinkedHashMap<String, PartitionDesc>();
      pt.put("/tmp/testfolder", partDesc);

      // define an aliasToWork mapping
      LinkedHashMap<String, Operator<? extends Serializable>> ao =
          new LinkedHashMap<String, Operator<? extends Serializable>>();
      ao.put("a", op);

      MapredWork mrwork = new MapredWork();
      mrwork.setPathToAliases(pa);
      mrwork.setPathToPartitionInfo(pt);
      mrwork.setAliasToWork(ao);
    } catch (Exception e) {
      // the original notes elide the error handling
      e.printStackTrace();
      fail();
    }
  }
}

My guess is that a job consists of input and output plus a MapredWork.
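One way to make that tangible in the unit test is to round-trip the plan: Hive ships the MapredWork to the tasks as XML via helpers on Utilities. A sketch, assuming the 0.6-era helper names serializeMapRedWork/deserializeMapRedWork:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.hadoop.mapred.JobConf;

// appended at the end of the test body above; helper names assumed
ByteArrayOutputStream baos = new ByteArrayOutputStream();
Utilities.serializeMapRedWork(mrwork, baos);          // the plan, as XML
MapredWork mrwork2 = Utilities.deserializeMapRedWork(
    new ByteArrayInputStream(baos.toByteArray()), new JobConf());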

Next, I look at how the plan is executed.

 

4.1.2 MapredWork

// this class is the core of the plan
public class MapredWork implements Serializable {
  private static final long serialVersionUID = 1L;
  private String command;

  // map side work
  // use LinkedHashMap to make sure the iteration order is
  // deterministic, to ease testing
  private LinkedHashMap<String, ArrayList<String>> pathToAliases;
  private LinkedHashMap<String, PartitionDesc> pathToPartitionInfo;
  private LinkedHashMap<String, Operator<? extends Serializable>> aliasToWork;
  private LinkedHashMap<String, PartitionDesc> aliasToPartnInfo;

  // map <-> reduce interface
  // schema of the map-reduce 'key' object - this is homogeneous
  private TableDesc keyDesc;
  // schema of the map-reduce 'val' object - this is heterogeneous
  private List<TableDesc> tagToValueDesc;
  private Operator<?> reducer;

  private Integer numReduceTasks;
  private Integer numMapTasks;
  private Integer minSplitSize;

  private boolean needsTagging;
  private boolean hadoopSupportsSplittable;

  private MapredLocalWork mapLocalWork;
  private String inputformat;
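An easy way to watch these fields being populated, without attaching a debugger, is EXPLAIN: running EXPLAIN SELECT url, count(*) FROM page_views GROUP BY url prints the stage dependency graph and, for each map-reduce stage, the map-side and reduce-side operator trees that this class carries.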

4.1.3 Driver

This is the main interface of the QL module.

// analysis
// Driver
compile(command)
{
  ctx = new Context(conf);
  ParseDriver pd = new ParseDriver();
  ASTNode tree = pd.parse(command, ctx);
  tree = ParseUtils.findRootNonNullToken(tree);

  BaseSemanticAnalyzer sem = SemanticAnalyzerFactory.get(conf, tree);
  // Do semantic analysis and plan generation
  sem.analyze(tree, ctx);
  // validate the plan
  sem.validate();

  plan = new QueryPlan(command, sem);
  // initialize FetchTask right here
  if (plan.getFetchTask() != null) {
    plan.getFetchTask().initialize(conf, plan, null);
  }
}
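For completeness, compile and execute are also reachable programmatically. A minimal sketch, assuming the 0.6-era public methods (Driver.run(command) is essentially these two calls plus error handling):

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Driver;
import org.apache.hadoop.hive.ql.session.SessionState;

// a minimal sketch of driving a query end to end, assuming the 0.6-era API
public class RunQuery {
  public static void main(String[] args) throws Exception {
    HiveConf conf = new HiveConf(SessionState.class);
    SessionState.start(new SessionState(conf));

    Driver driver = new Driver(conf);
    int ret = driver.compile("SELECT url, count(*) FROM page_views GROUP BY url");
    if (ret == 0) {
      ret = driver.execute();  // runs the root tasks of the generated plan
    }
    System.exit(ret);
  }
}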

 

// launchTask
// this is similar to the way a task is run in Hadoop
TaskResult tskRes = new TaskResult();
TaskRunner tskRun = new TaskRunner(tsk, tskRes);
// start a thread that calls task.execute
tskRun.start();
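Stripped of the Hive specifics, the pattern is plain Java threading. An illustrative reduction follows (these are not Hive's actual classes, and the real TaskResult/TaskRunner carry more state):

import java.util.List;

// an illustrative reduction of the runner pattern, not Hive's actual classes
interface Task {
  int execute();
  List<Task> getChildren();
}

class TaskResult {
  private volatile int exitVal;
  private volatile boolean running = true;
  void setExitVal(int v) { exitVal = v; running = false; }
  boolean isRunning() { return running; }
  int getExitVal() { return exitVal; }
}

class TaskRunner extends Thread {
  private final Task tsk;
  private final TaskResult result;
  TaskRunner(Task tsk, TaskResult result) { this.tsk = tsk; this.result = result; }
  Task getTask() { return tsk; }
  // the driver polls 'result' while this thread runs the task
  public void run() { result.setExitVal(tsk.execute()); }
}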

 

// physical execution
public int execute() {
  plan.setStarted();
  int jobs = countJobs(plan.getRootTasks());

  // take the root tasks and queue them
  for (Task<? extends Serializable> tsk : plan.getRootTasks()) {
    driverCxt.addToRunnable(tsk);
  }

  // run them in turn
  // Loop while you either have tasks running, or tasks queued up
  while (running.size() != 0 || runnable.peek() != null) {
    // Launch up to maxthreads tasks
    while (runnable.peek() != null && running.size() < maxthreads) {
      Task<? extends Serializable> tsk = runnable.remove();
      launchTask(tsk, queryId, noName, running, jobname, jobs, driverCxt);
    }

    // poll the Tasks to see which one completed
    TaskResult tskRes = pollTasks(running.keySet());
    TaskRunner tskRun = running.remove(tskRes);
    Task<? extends Serializable> tsk = tskRun.getTask();

    if (tsk.getChildTasks() != null) {
      for (Task<? extends Serializable> child : tsk.getChildTasks()) {
        if (DriverContext.isLaunchable(child)) {
          driverCxt.addToRunnable(child);
        }
      }
    }
  }
}
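Distilled, this loop is a small breadth-first scheduler over the task DAG. A self-contained sketch of the same control flow, reusing the Task/TaskRunner reduction above (the real code also checks, via DriverContext.isLaunchable, that all of a child's parents have finished before queuing it):

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.Queue;

// an illustrative reduction of Driver.execute()'s scheduling loop;
// assumes maxThreads >= 1
public class MiniScheduler {
  public void run(List<Task> rootTasks, int maxThreads) throws InterruptedException {
    Queue<Task> runnable = new ArrayDeque<Task>(rootTasks);
    List<TaskRunner> running = new ArrayList<TaskRunner>();
    while (!running.isEmpty() || !runnable.isEmpty()) {
      // launch up to maxThreads tasks
      while (!runnable.isEmpty() && running.size() < maxThreads) {
        TaskRunner r = new TaskRunner(runnable.remove(), new TaskResult());
        r.start();
        running.add(r);
      }
      // simplified "poll": wait for the oldest runner instead of
      // polling all of them the way pollTasks does
      TaskRunner done = running.remove(0);
      done.join();
      // a finished task makes its children runnable (the real code
      // first checks that every parent of the child has completed)
      runnable.addAll(done.getTask().getChildren());
    }
  }
}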

 

4.1.4 QueryPlan

// builds the graph that is executed; the most complex part of Hive
/**
 * Populate api.QueryPlan from exec structures. This includes constructing the
 * dependency graphs of stages and operators.
 *
 * @throws IOException
 */
private void populateQueryPlan() throws IOException {
  query.setStageGraph(new org.apache.hadoop.hive.ql.plan.api.Graph());
  query.getStageGraph().setNodeType(NodeType.STAGE);
  // ...
}

Reading this far, I find Hive quite complex; using it well is not easy.

If you do not have that many queries, writing the mapper and reduce tasks by hand may be perfectly fine.

4.1.5 Task

4.1.6 Operators

The operator types include:

JOIN(0),
MAPJOIN(1),
EXTRACT(2),
FILTER(3),
FORWARD(4),
GROUPBY(5),
LIMIT(6),
SCRIPT(7),
SELECT(8),
TABLESCAN(9),
FILESINK(10),
REDUCESINK(11),
UNION(12),
UDTF(13),
LATERALVIEWJOIN(14),
LATERALVIEWFORWARD(15),
HASHTABLESINK(16),
HASHTABLEDUMMY(17);

They are all registered in the OperatorFactory class and managed through the factory pattern:

opvec = new ArrayList<OpTuple>();
opvec.add(new OpTuple<FilterDesc>(FilterDesc.class, FilterOperator.class));
opvec.add(new OpTuple<SelectDesc>(SelectDesc.class, SelectOperator.class));
opvec.add(new OpTuple<ForwardDesc>(ForwardDesc.class, ForwardOperator.class));
opvec.add(new OpTuple<FileSinkDesc>(FileSinkDesc.class, FileSinkOperator.class));
opvec.add(new OpTuple<CollectDesc>(CollectDesc.class, CollectOperator.class));
opvec.add(new OpTuple<ScriptDesc>(ScriptDesc.class, ScriptOperator.class));
opvec.add(new OpTuple<ReduceSinkDesc>(ReduceSinkDesc.class, ReduceSinkOperator.class));
opvec.add(new OpTuple<ExtractDesc>(ExtractDesc.class, ExtractOperator.class));
opvec.add(new OpTuple<GroupByDesc>(GroupByDesc.class, GroupByOperator.class));
opvec.add(new OpTuple<JoinDesc>(JoinDesc.class, JoinOperator.class));
opvec.add(new OpTuple<MapJoinDesc>(MapJoinDesc.class, MapJoinOperator.class));
opvec.add(new OpTuple<SMBJoinDesc>(SMBJoinDesc.class, SMBMapJoinOperator.class));
opvec.add(new OpTuple<LimitDesc>(LimitDesc.class, LimitOperator.class));
opvec.add(new OpTuple<TableScanDesc>(TableScanDesc.class, TableScanOperator.class));
opvec.add(new OpTuple<UnionDesc>(UnionDesc.class, UnionOperator.class));
opvec.add(new OpTuple<UDTFDesc>(UDTFDesc.class, UDTFOperator.class));
opvec.add(new OpTuple<LateralViewJoinDesc>(LateralViewJoinDesc.class,
    LateralViewJoinOperator.class));
opvec.add(new OpTuple<LateralViewForwardDesc>(LateralViewForwardDesc.class,
    LateralViewForwardOperator.class));
opvec.add(new OpTuple<HashTableDummyDesc>(HashTableDummyDesc.class,
    HashTableDummyOperator.class));
opvec.add(new OpTuple<HashTableSinkDesc>(HashTableSinkDesc.class,
    HashTableSinkOperator.class));
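get(descClass) then just scans this list and instantiates the registered operator class through its no-argument constructor; a simplified rendering of the real method:

// simplified from OperatorFactory.get(): look up the operator class
// registered for this descriptor class and instantiate it reflectively
public static <T extends Serializable> Operator<T> get(Class<T> descClass) {
  for (OpTuple o : opvec) {
    if (o.descClass == descClass) {
      try {
        return (Operator<T>) o.opClass.newInstance();
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }
  }
  throw new RuntimeException("No operator registered for " + descClass.getName());
}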

See: TestOperators.testScriptOperator and TestOperators.testMapOperator.

4.1.6.1 Operator graph structure

sd is a ScriptDesc (the descriptor for a script operator) and op is an existing Operator.

The following calls build them into a tree:

ScriptDesc sd = new ScriptDesc("cat", scriptOutput,
    TextRecordWriter.class, scriptInput,
    TextRecordReader.class, TextRecordReader.class,
    PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "key"));
Operator<ScriptDesc> sop = OperatorFactory.getAndMakeChild(sd, op);

// Collect operator to observe the output of the script
CollectDesc cd = new CollectDesc(Integer.valueOf(10));
CollectOperator cdop = (CollectOperator) OperatorFactory.getAndMakeChild(cd, sop);
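What getAndMakeChild adds on top of get is the wiring: it creates the operator for the descriptor and links the parent and child lists in both directions, which is how these calls grow a tree. A simplified rendering:

// simplified from OperatorFactory.getAndMakeChild(): create the operator
// for conf and make it a child of every operator in oplist
// (needs java.util.ArrayList, java.util.Arrays, java.util.List)
public static <T extends Serializable> Operator<T> getAndMakeChild(
    T conf, Operator<? extends Serializable>... oplist) {
  Operator<T> ret = get((Class<T>) conf.getClass());
  ret.setConf(conf);
  if (oplist.length == 0) {
    return ret;
  }

  // the new operator's parents are the given operators...
  List<Operator<? extends Serializable>> parents =
      new ArrayList<Operator<? extends Serializable>>(Arrays.asList(oplist));
  ret.setParentOperators(parents);

  // ...and the new operator becomes each given operator's child
  for (Operator<? extends Serializable> op : oplist) {
    List<Operator<? extends Serializable>> children = op.getChildOperators();
    if (children == null) {
      children = new ArrayList<Operator<? extends Serializable>>();
    }
    children.add(ret);
    op.setChildOperators(children);
  }
  return ret;
}

The tree in the test is thus op -> sop -> cdop, with the CollectOperator buffering rows so the test can observe the script's output.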
