pycurl实现hadoop的客户端功能.docx

资源描述

pycurl实现hadoop的客户端功能.docx

《pycurl实现hadoop的客户端功能.docx》由会员分享，可在线阅读，更多相关《pycurl实现hadoop的客户端功能.docx（18页珍藏版）》请在冰豆网上搜索。

pycurl实现hadoop的客户端功能.docx

pycurl实现hadoop的客户端功能

目前在测试一个hadoop的功能，需要频繁的和hadoop打交道。

刚开始采用的是python的subprocess模块，通过调用底层的hadoop提供的命令行工具来实现。

一，hadoop提供的命令行格式说明：

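hadoop fs [cmd]，具体的命令有:

hadoop fs [-fs <local | file system URI>] [-conf <configuration file>]

[-D <property=value>] [-ls <path>] [-lsr <path>] [-du <path>]

[-dus <path>] [-mv <src> <dst>] [-cp <src> <dst>] [-rm [-skipTrash] <src>]

[-rmr [-skipTrash] <src>] [-put <localsrc> ... <dst>] [-copyFromLocal <localsrc> ... <dst>]

[-moveFromLocal <localsrc> ... <dst>] [-get [-ignoreCrc] [-crc] <src> <localdst>

[-getmerge <src> <localdst> [addnl]] [-cat <src>]

[-copyToLocal [-ignoreCrc] [-crc] <src> <localdst>] [-moveToLocal <src> <localdst>]

[-mkdir <path>] [-report] [-setrep [-R] [-w] <rep> <path/file>]

[-touchz <path>] [-test -[ezd] <path>] [-stat [format] <path>]

[-tail [-f] <path>] [-text <path>]

[-chmod [-R] <MODE[,MODE]... | OCTALMODE> PATH...]

[-chown [-R] [OWNER][:[GROUP]] PATH...]

[-chgrp [-R] GROUP PATH...]

[-count[-q] <path>]

[-help [cmd]]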

从上面可以看出命令提供的功能还是挺强大的。

包括了文件和对目录的各种操作。

举个例子：

要列出hadoop的根目录下面的文件,具体命令如下：

# hadoop fs -ls hdfs://192.168.0.112:50081/

drwx---r-x   - test test          0 2013-03-08 11:20 /static

drwx---r-x   - test test          0 2013-02-19 15:40 /system

drwxrwxrwx   - test test          0 2013-01-22 18:42 /video

其他的命令功能就不一一介绍了，相信看帮助文档自己也可以看懂。

这样会有一个问题，每执行一个命令都会新启动一个JVM，对运行命令的机器造成很大的负担，在命令多的情况下，查看top可以看到java的进程会跑到99%，严重影响到机器的使用。

于是有了下面的实现方法。

二，hadoop提供的web方式

在网上查看官方的客户端API，发现hadoop提供一个Web REST API，即采用curl的方式就可以轻松实现。

官方文档链接为：

http://hadoop.apache.org/docs/stable/webhdfs.html

上面对使用方式进行充分的说明。

curl的方式可以进行对hadoop中的文件和目录进行一些基本的操作。

目前官网上提供的有

1,创建并写入文件

2，追加文件

3，打开并读入文件

4，创建目录

5，重命名文件或者目录

6，删除文件或者目录

7，列出文件或者目录状态

8，列出目录列表

下面提供一些具体的使用例子：

a，列出目录的状态

# curl -i "http://192.168.0.112:50071/webhdfs/v1/?op=GETFILESTATUS"

HTTP/1.1 200 OK

Content-Type: application/json

Transfer-Encoding: chunked

Server: Jetty(6.1.26)

{"FileStatus":{"accessTime":0,"blockSize":0,"group":"TEST","length":0,"modificationTime":1362812718704,"owner":"TEST","pathSuffix":"","permission":"705","replication":0,"type":"DIRECTORY"}}

b，重命名目录

# curl -i -X PUT "http://192.168.0.112:50071/webhdfs/v1/test?op=RENAME&destination=/test1"

HTTP/1.1 200 OK

Content-Type: application/json

Transfer-Encoding: chunked

{"boolean":true}

其他的功能就不一一介绍了。

具体的实现方式请看官方文档

三，由curl的方式想到的

因为我的程序是用python跑的，而采用curl命令行的方式同样是调用底层命令；python的模块那么多，如果使用python的curl库（pycurl），不就可以轻松实现python对hadoop中文件和目录的操作了吗？

在经过查资料后，写了一个基本的webhadoop的class，基本的功能大概完成了，其他的东西以后再加吧。

具体的代码如下：

1.#!/usr/bin/env python

2.# -*- encoding: utf-8 -*-

3."""A library to access Hadoop HTTP REST API,

4. make sure you hadoop cluster open the http access .

5."""

6.'''

7.author :

liran

8.data :

2013-03-11

10.致谢：

xwu

11. 武汉云雅科技有限公司

12.

13.'''

14.import StringIO

15.import pycurl

16.import re

17.import sys

18.import logging

19.import os

20.

21.class WebHadoop（object）:

22. def __init__（self,host,port,username,logger,prefix="/webhdfs/v1"）:

23. self.host = host

24. self.port = port

25. self.user = username

26. self.logger = logger

27. self.prefix = prefix

28. self.status = None

29. self.url = "http://%s:%s" % （host,port）

30. self.url_path = self.url + self.prefix

31.

32.

33.

34. def checklink（self）:

35. try:

36. b = StringIO.StringIO（）

37. c = pycurl.Curl（）

38. checkurl = self.url + "/dfsnodelist.jsp?

whatNodes=LIVE"

39. c.setopt（pycurl.URL, checkurl）

40. c.setopt（pycurl.HTTPHEADER, ["Accept:

"]）

41. c.setopt（pycurl.WRITEFUNCTION, b.write）

42. c.setopt（pycurl.FOLLOWLOCATION, 1）

43. c.setopt（pycurl.MAXREDIRS, 5）

44. c.perform（）

45. self.status = c.getinfo（c.HTTP_CODE）

46. body = b.getvalue（）

47. self.Write_Debug_Log（self.status,checkurl）

48. p = pile（r'''Live Datanodes :

（.*）

49. results = p.findall（body）

50. b.close（）

51. if results[0] == "0":

52. self.logger.error（"Sorry, There are not live datanodes in Hadoop Cluster!

"）

53. self.curlObj.close（）

54. sys.exit（255）

55. return results[0]

56. except pycurl.error,e:

57. self.logger.error（"Sorry, can not get the hadoop http link .Erros:

%s" % e）

58. c.close（）

59. b.close（）

60. sys.exit（255）

61. finally:

62. c.close（）

63. b.close（）

64.

65.

66. def lsdir（self,path）:

67. try:

68. b = StringIO.StringIO（）

69. put_str = '[{"op":

LISTSTATUS}]'

70.

71. c = pycurl.Curl（）

72.

73. lsdir_url = self.url_path + path + "?

op=LISTSTATUS"

74. c.setopt（pycurl.URL, lsdir_url）

75. c.setopt（pycurl.HTTPHEADER, ["Accept:

"]）

76. c.setopt（pycurl.WRITEFUNCTION, b.write）

77. c.setopt（pycurl.FOLLOWLOCATION, 1）

78. c.setopt（pycurl.MAXREDIRS, 5）

79. c.perform（）

80. body = b.getvalue（）

81. self.status = c.getinfo（c.HTTP_CODE）

82. except Exception,e:

83. print e

84. finally:

85. c.close（）

86. b.close（）

87.

88.

89. if self.status == 200:

90. data_dir = eval（body）

91. return data_dir['FileStatuses']['FileStatus']

92.

93. else:

94. self.logger.error（"Sorry,can not list the dir or file status!

"）

95. self.Write_Debug_Log（self.status,lsdir_url）

96. return False

97.

98.

99. def lsfile（self,path）:

100. try:

101. c = pycurl.Curl（）

102. b = StringIO.StringIO（）

103. put_str = '[{"op":

LISTSTATUS}]'

104. lsdir_url = self.url_path + path + "?

op=GETFILESTATUS"

105. c.setopt（pycurl.URL, lsdir_url）

106. c.setopt（pycurl.HTTPHEADER, ["Accept:

"]）

107. c.setopt（pycurl.WRITEFUNCTION, b.write）

108. c.setopt（pycurl.FOLLOWLOCATION, 1）

109. c.setopt（pycurl.MAXREDIRS, 5）

110. c.perform（）

111. body = b.getvalue（）

112. self.status = c.getinfo（c.HTTP_CODE）

113. except Exception,e:

114. print e

115. finally:

116. c.close（）

117. b.close（）

118.

119. if self.status == 200:

120. data_dir = eval（body）

121. if data_dir['FileStatus']['type'] == "DIRECTORY":

122. self.logger.error（"Sorry,this file %s is a dir actually!

" % （path））

123. return False

124. else:

125. return data_dir['FileStatus']

126. else:

127. self.logger.error（"Sorry,can not list the dir or file status!

"）

128. self.Write_Debug_Log（self.status,lsdir_url）

129. return False

130.

131. def mkdir（self,path,permission="755"）:

132. try:

133. print "yes ,mkdir function"

134. b = StringIO.StringIO（）

135. c = pycurl.Curl（）

136. mkdir_str = '[{"op":

"MKDIRS","permission"=permission}]'

137. mkdir_url = "%s%s?

op=MKDIRS&permission=%s" % （self.url_path,path,permission）

138. c.setopt（pycurl.URL, mkdir_url）

139. c.setopt（pycurl.HTTPHEADER,['Content-Type:

application/json','Content-Length:

'+str（len（mkdir_str））]）

140. c.setopt（pycurl.CUSTOMREQUEST,"PUT"）

141. c.setopt（pycurl.POSTFIELDS,mkdir_str）

142.

143. c.setopt（pycurl.WRITEFUNCTION, b.write）

144. c.setopt（pycurl.FOLLOWLOCATION, 1）

145. c.setopt（pycurl.MAXREDIRS, 5）

146. c.perform（）

147. self.status = c.getinfo（c.HTTP_CODE）

148. body = b.getvalue（）

149. b.close（）

150. except Exception,e:

151. print e

152. finally:

153. c.close（）

154.

155.

156. if self.status == 200 :

157. if "true" in body:

158. self.logger.info（"Great,Successfully Create dir %s in hadoop cluster!

" % （path））

159. return True

160. elif "false" in body:

161. self.logger.info（"Sorry,can't create this %s dir in hadoop cluster!

"）

162. return False

163. else:

164. return False

165. else:

166. self.logger.error（"Sorry,can't create this %s dir in hadoop cluster!

1" % （path））

167. self.Write_Debug_Log（self.status,mkdir_url）

168.

169.

170. def remove（self,path,recursive="True"）:

171. try:

172. c = pycurl.Curl（）

173. b = StringIO.StringIO（）

174. remove_str = '[{"op":

"DELETE","recursive"=recursive}]'

175. remvoe_url = "%s%s?

op=DELETE&recursive=%s" % （self.url_path,path,recursive）

176. c.setopt（pycurl.URL, remvoe_url）

177. c.setopt（pycurl.HTTPHEADER,['Content-Type:

application/json','Content-Length:

'+str（len（remove_str））]）

178. c.setopt（pycurl.CUSTOMREQUEST,"DELETE"）

179. c.setopt（pycurl.POSTFIELDS,remove_str）

180. c.setopt（pycurl.WRITEFUNCTION, b.write）

181. c.setopt（pycurl.FOLLOWLOCATION,

展开阅读全文