尊重原創著作權: https://www.gewuweb.com/hot/15800.html
手把手帶你原始碼解析HDFS檔案上傳之write上傳流程
尊重原創著作權: https://www.gewuweb.com/sitemap.html
HDFS的寫資料流程,如下圖所示:

HDFS上傳原始碼解析如下圖所示:

0)在pom.xml中增加如下依賴
<!-- Dependencies used by the HDFS upload example in this article; all Hadoop artifacts are pinned to 3.1.3. -->
<dependencies>
<!-- Hadoop client artifact. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.3</version>
</dependency>
<!-- HDFS artifact (presumably added so the NameNode/DataNode classes can be browsed; confirm). -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>3.1.3</version>
</dependency>
<!-- HDFS client artifact, declared with "provided" scope. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs-client</artifactId>
<version>3.1.3</version>
<scope>provided</scope>
</dependency>
<!-- JUnit 4, used by the @Test method in the example. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<!-- SLF4J binding artifact for log4j 1.2. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.30</version>
</dependency>
</dependencies>
** 一、向DataStreamer的佇列里面寫資料 **
用戶自己寫的代碼
/**
 * Creates {@code /input} on HDFS and writes a short payload to it.
 *
 * <p>The stream is opened in try-with-resources so that it is always closed;
 * without the close the buffered bytes are never flushed into a packet and the
 * file is left incomplete.
 *
 * @throws IOException if the file cannot be created or written
 */
@Test
public void testPut2() throws IOException {
    try (FSDataOutputStream fos = fs.create(new Path("/input"))) {
        // Explicit charset: do not depend on the platform default encoding.
        fos.write("hello world".getBytes(java.nio.charset.StandardCharsets.UTF_8));
    }
}
點擊write
FilterOutputStream.java
/**
 * Writes every byte of {@code b} by delegating to the ranged overload
 * with offset 0 and the full array length.
 *
 * @param b the bytes to write
 * @throws IOException if the ranged write fails
 */
public void write(byte b[]) throws IOException {
    final int length = b.length;
    write(b, 0, length);
}
/**
 * Writes {@code len} bytes of {@code b}, starting at {@code off}, one byte at a
 * time through {@link #write(int)}.
 *
 * @param b   source array
 * @param off index of the first byte to write
 * @param len number of bytes to write
 * @throws IndexOutOfBoundsException if the range is negative, overflows, or
 *                                   does not fit inside {@code b}
 * @throws IOException               if writing a byte fails
 */
public void write(byte b[], int off, int len) throws IOException {
    // Read the length first so a null array fails before the range check.
    final int capacity = b.length;
    final int end = off + len;
    final boolean invalidRange = off < 0 || len < 0 || capacity - end < 0 || end < 0;
    if (invalidRange) {
        throw new IndexOutOfBoundsException();
    }
    for (int pos = off; pos < end; pos++) {
        write(b[pos]);
    }
}
/**
 * Forwards a single byte to the wrapped stream {@code out}.
 *
 * @param b the byte to write
 * @throws IOException if the wrapped stream fails
 */
public void write(int b) throws IOException {
    this.out.write(b);
}
點擊write
OutputStream.java
/**
 * Writes a single byte; concrete stream classes supply the implementation.
 *
 * @param b the byte to write
 * @throws IOException if the write fails
 */
public abstract void write(int b) throws IOException;
ctrl + h 查找write實作類,選擇FSOutputSummer.java,在該類中查找write
FSOutputSummer.java
/**
 * Appends one byte to the internal buffer and flushes the buffer as soon as
 * it becomes full.
 *
 * @param b the byte to buffer
 * @throws IOException if flushing the buffer fails
 */
public synchronized void write(int b) throws IOException {
    buf[count++] = (byte) b;
    if (count != buf.length) {
        return; // buffer still has room
    }
    flushBuffer();
}
/**
 * Flushes everything currently buffered, including a trailing partial chunk,
 * and keeps nothing in the buffer.
 *
 * @throws IOException if writing the chunks fails
 */
protected synchronized void flushBuffer() throws IOException {
    final boolean keep = false;
    final boolean flushPartial = true;
    flushBuffer(keep, flushPartial);
}
/**
 * Flushes buffered bytes as checksummed chunks.
 *
 * @param keep         when true, the trailing partial chunk stays in the buffer
 *                     after the flush
 * @param flushPartial when true, the trailing partial chunk is written out as
 *                     well; otherwise only whole chunks are written
 * @return bytes left in the buffer minus the bytes that were not flushed
 * @throws IOException if writing the chunks fails
 */
protected synchronized int flushBuffer(boolean keep,
    boolean flushPartial) throws IOException {
    final int buffered = count;
    final int tail = buffered % sum.getBytesPerChecksum();
    final int toFlush;
    if (flushPartial) {
        toFlush = buffered;
    } else {
        toFlush = buffered - tail;
    }
    if (toFlush != 0) {
        // Hand the data over to the packet queue.
        // Directory => File => Block (128M) => packet (64K)
        //   => chunk (512 data bytes + 4 checksum bytes)
        writeChecksumChunks(buf, 0, toFlush);
        if (flushPartial && !keep) {
            count = 0;
        } else {
            // Move the partial chunk to the front of the buffer.
            count = tail;
            System.arraycopy(buf, buffered - count, buf, 0, count);
        }
    }
    // total bytes left minus unflushed bytes left
    final int unflushed = buffered - toFlush;
    return count - unflushed;
}
/**
 * Computes the checksums for {@code len} bytes of {@code b} and hands the data
 * to {@code writeChunk} one chunk at a time.
 *
 * @param b   source buffer
 * @param off offset of the first byte
 * @param len number of bytes to write
 * @throws IOException if writing a chunk fails
 */
private void writeChecksumChunks(byte b[], int off, int len)
    throws IOException {
    // Compute the checksum of every chunk up front.
    sum.calculateChunkedSums(b, off, len, checksum, 0);
    TraceScope scope = createWriteTraceScope();
    try {
        // Walk the data in steps of one chunk.
        int consumed = 0;
        while (consumed < len) {
            final int chunkLen = Math.min(sum.getBytesPerChecksum(), len - consumed);
            final int ckOffset = consumed / sum.getBytesPerChecksum() * getChecksumSize();
            // Push this chunk (and its checksum) towards the queue.
            writeChunk(b, off + consumed, chunkLen, checksum, ckOffset,
                getChecksumSize());
            consumed += sum.getBytesPerChecksum();
        }
    } finally {
        if (scope != null) {
            scope.close();
        }
    }
}
protected abstract void writeChunk(byte[] b, int bOffset, int bLen,
byte[] checksum, int checksumOffset, intchecksumLen) throws IOException;
ctrl + h 查找writeChunk實作類DFSOutputStream.java
protected synchronized void writeChunk(byte[] b, intoffset, int len,
byte[] checksum, int ckoff, int cklen)throws IOException {
writeChunkPrepare(len, ckoff, cklen);
// 往packet里面寫chunk的校驗和 4byte
currentPacket.writeChecksum(checksum, ckoff,cklen);
// 往packet里面寫一個chunk 512 byte
currentPacket.writeData(b, offset, len);
// 記錄寫入packet中的chunk個數,累計到127個chuck,這個packet就滿了
currentPacket.incNumChunks();
getStreamer().incBytesCurBlock(len);
// If packet is full, enqueue it fortransmission
if (currentPacket.getNumChunks() == currentPacket.getMaxChunks()||
getStreamer().getBytesCurBlock() ==blockSize) {
enqueueCurrentPacketFull();
}
}
/**
 * Enqueues the current (full) packet, then adjusts the chunk boundary and
 * ends the block if needed.
 *
 * <p>The log format string is restored to {@code src={}}: the excerpt had a
 * web URL pasted into the literal, which is not part of the log message.
 *
 * @throws IOException if the packet cannot be enqueued
 */
synchronized void enqueueCurrentPacketFull() throws IOException {
    LOG.debug("enqueue full {}, src={}, bytesCurBlock={}, blockSize={},"
            + " appendChunk={}, {}", currentPacket, src, getStreamer()
            .getBytesCurBlock(), blockSize, getStreamer().getAppendChunk(),
        getStreamer());
    enqueueCurrentPacket();
    adjustChunkBoundary();
    endBlock();
}
/**
 * Hands the current packet to the streamer's queue and clears the reference
 * so that a fresh packet is started afterwards.
 *
 * @throws IOException if the streamer rejects the packet
 */
void enqueueCurrentPacket() throws IOException {
    final DataStreamer streamer = getStreamer();
    streamer.waitAndQueuePacket(currentPacket);
    currentPacket = null;
}
/**
 * Blocks while dataQueue and ackQueue together hold more packets than the
 * configured maximum, then appends the packet to dataQueue.
 * A ClosedChannelException raised on the way is ignored.
 *
 * @param packet the packet to enqueue
 * @throws IOException propagated from the checks and waits below
 */
void waitAndQueuePacket(DFSPacket packet) throws IOException{
synchronized (dataQueue) {
try {
// If the queue is full, wait until there is enough space.
boolean firstWait = true;
try {
while (!streamerClosed &&dataQueue.size() + ackQueue.size() >
dfsClient.getConf().getWriteMaxPackets()) {
if (firstWait) {
// Annotate the current trace span once, on the first wait only.
Span span =Tracer.getCurrentSpan();
if (span != null) {
span.addTimelineAnnotation("dataQueue.wait");
}
firstWait = false;
}
try {
dataQueue.wait();
} catch (InterruptedException e) {
// (interrupt handling is elided in the article)
... ...
}
}
} finally {
// Only annotate the end of the wait if we actually waited.
Span span = Tracer.getCurrentSpan();
if ((span != null) && (!firstWait)){
span.addTimelineAnnotation("end.wait");
}
}
checkClosed();
// The queue has room now: add the packet to it.
queuePacket(packet);
} catch (ClosedChannelException ignored) {
}
}
}
DataStreamer.java
void queuePacket(DFSPacketpacket) {
synchronized (dataQueue) {
if (packet == null) return;
packet.addTraceParent(Tracer.getCurrentSpanId());
// 向佇列中添加資料
dataQueue.addLast(packet);
lastQueuedSeqno = packet.getSeqno();
LOG.debug("Queued {}, {}",packet, this);
// 通知佇列添加資料完成
dataQueue.notifyAll();
}
}
** 二、 建立管道之機架感知(塊儲存位置) **
1)點擊create
Ctrl + n全域查找DataStreamer,搜索run方法
DataStreamer.java
@Override
public void run() {
long lastPacket = Time.monotonicNow();
TraceScope scope = null;
while (!streamerClosed &&dfsClient.clientRunning) {
// if the Responder encountered an error, shutdown Responder
if (errorState.hasError()) {
closeResponder();
}
DFSPacket one;
try {
// process datanode IO errors ifany
boolean doSleep =processDatanodeOrExternalError();
final int halfSocketTimeout =dfsClient.getConf().getSocketTimeout()/2;
synchronized (dataQueue) {
// wait for a packet to be sent.
long now = Time.monotonicNow();
while ((!shouldStop() && dataQueue.size() == 0 &&
(stage !=BlockConstructionStage.DATA_STREAMING ||
now - lastPacket < halfSocketTimeout)) ||doSleep) {
long timeout =halfSocketTimeout - (now-lastPacket);
timeout = timeout <= 0 ?1000 : timeout;
timeout = (stage ==BlockConstructionStage.DATA_STREAMING)?
timeout : 1000;
try {
// 如果dataQueue里面沒有資料,代碼會阻塞在這兒
dataQueue.wait(timeout); // 接收到notify訊息
} catch(InterruptedException e) {
LOG.warn("Caught exception", e);
}
doSleep = false;
now = Time.monotonicNow();
}
if (shouldStop()) {
continue;
}
// get packet to be sent.
if (dataQueue.isEmpty()) {
one =createHeartbeatPacket();
} else {
try {
backOffIfNecessary();
} catch(InterruptedException e) {
LOG.warn("Caught exception", e);
}
// 佇列不為空,從佇列中取出packet
one = dataQueue.getFirst(); // regular data packet
SpanId[] parents =one.getTraceParents();
if (parents.length > 0){
scope = dfsClient.getTracer().
newScope("dataStreamer",parents[0]);
scope.getSpan().setParents(parents);
}
}
}
// get new block from namenode.
if (LOG.isDebugEnabled()) {
LOG.debug("stage=" + stage + ", " + this);
}
if (stage ==BlockConstructionStage.PIPELINE_SETUP_CREATE) {
LOG.debug("Allocating new block: {}", this);
// 步驟一:向NameNode 申請block 并建立資料管道
setPipeline(nextBlockOutputStream());
// 步驟二:啟動ResponseProcessor用來監聽packet發送是否成功
initDataStreaming();
} else if (stage ==BlockConstructionStage.PIPELINE_SETUP_APPEND) {
setupPipelineForAppendOrRecovery();
if (streamerClosed) {
continue;
}
initDataStreaming();
}
long lastByteOffsetInBlock =one.getLastByteOffsetBlock();
if (lastByteOffsetInBlock >stat.getBlockSize()) {
throw new IOException("BlockSize " + stat.getBlockSize() +
" < lastByteOffsetInBlock, " +this + ", " + one);
}
… …
// send the packet
SpanId spanId = SpanId.INVALID;
synchronized (dataQueue) {
// move packet from dataQueue to ackQueue
if (!one.isHeartbeatPacket()) {
if (scope != null) {
spanId = scope.getSpanId();
scope.detach();
one.setTraceScope(scope);
}
scope = null;
// 步驟三:從dataQueue 把要發送的這個packet 移除出去
dataQueue.removeFirst();
// 步驟四:然后往ackQueue 里面添加這個packet
ackQueue.addLast(one);
packetSendTime.put(one.getSeqno(),Time.monotonicNow());
dataQueue.notifyAll();
}
}
LOG.debug("{} sending{}", this, one);
// write out data to remotedatanode
try (TraceScope ignored =dfsClient.getTracer().
newScope("DataStreamer#writeTo",spanId)) {
// 將資料寫出去
one.writeTo(blockStream);
blockStream.flush();
} catch (IOException e) {
errorState.markFirstNodeIfNotMarked();
throw e;
}
… …
}
點擊nextBlockOutputStream
protected LocatedBlock nextBlockOutputStream() throws IOException {
LocatedBlock lb;
DatanodeInfo[] nodes;
StorageType[] nextStorageTypes;
String[] nextStorageIDs;
int count =dfsClient.getConf().getNumBlockWriteRetry();
boolean success;
final ExtendedBlock oldBlock =block.getCurrentBlock();
do {
errorState.resetInternalError();
lastException.clear();
DatanodeInfo[] excluded =getExcludedNodes();
// 向NN獲取向哪個DN寫資料
lb = locateFollowingBlock(
excluded.length > 0 ? excluded :null, oldBlock);
// 創建管道
success = createBlockOutputStream(nodes,nextStorageTypes, nextStorageIDs,
0L, false);
… …
} while (!success && --count >=0);
if (!success) {
throw new IOException("Unable to createnew block.");
}
return lb;
}
/**
 * Requests the block that follows {@code oldBlock} by delegating to
 * {@code DFSOutputStream.addBlock}.
 *
 * @param excluded DataNodes to exclude, or {@code null}
 * @param oldBlock the previous block of the file
 * @return the newly located block
 * @throws IOException if the request fails
 */
private LocatedBlock locateFollowingBlock(DatanodeInfo[] excluded,
    ExtendedBlock oldBlock) throws IOException {
    final LocatedBlock located = DFSOutputStream.addBlock(
        excluded, dfsClient, src, oldBlock,
        stat.getFileId(), favoredNodes, addBlockFlags);
    return located;
}
/**
 * Client-side helper that forwards the add-block request to the NameNode
 * proxy (dfsClient.namenode). The surrounding logic is elided in the article.
 */
static LocatedBlock addBlock(DatanodeInfo[]excludedNodes,
DFSClient dfsClient, String src,ExtendedBlock prevBlock, long fileId,
String[] favoredNodes,EnumSet<AddBlockFlag> allocFlags)
throws IOException {
... ...
// Ask the NameNode which DataNodes the new block should be written to.
return dfsClient.namenode.addBlock(src,dfsClient.clientName, prevBlock,
excludedNodes, fileId,favoredNodes, allocFlags);
... ...
}
/**
 * Interface declaration of the add-block call; the article next looks at its
 * implementation in NameNodeRpcServer.
 */
LocatedBlock addBlock(String src, String clientName,
ExtendedBlock previous, DatanodeInfo[]excludeNodes, long fileId,
String[] favoredNodes,EnumSet<AddBlockFlag> addBlockFlags)
throws IOException;
ctrl + h 點擊NameNodeRpcServer,在該類中搜索addBlock
NameNodeRpcServer.java
public LocatedBlock addBlock(String src, StringclientName,
ExtendedBlock previous, DatanodeInfo[]excludedNodes, long fileId,
String[] favoredNodes,EnumSet<AddBlockFlag> addBlockFlags)
throws IOException {
checkNNStartup();
LocatedBlock locatedBlock = namesystem.getAdditionalBlock(src, fileId,
clientName, previous, excludedNodes,favoredNodes, addBlockFlags);
if (locatedBlock != null) {
metrics.incrAddBlockOps();
}
return locatedBlock;
}
FSNamesystem.java
LocatedBlock getAdditionalBlock(
String src, long fileId, String clientName,ExtendedBlock previous,
DatanodeInfo[] excludedNodes, String[]favoredNodes,
EnumSet<AddBlockFlag> flags) throwsIOException {
final String operationName ="getAdditionalBlock";
NameNode.stateChangeLog.debug("BLOCK*getAdditionalBlock: {} inodeId {}"+
" for {}", src, fileId,clientName);
... ...
// 選擇塊存盤位置
DatanodeStorageInfo[] targets =FSDirWriteFileOp.chooseTargetForNewBlock(
blockManager, src, excludedNodes,favoredNodes, flags, r);
... ...
return lb;
}
staticDatanodeStorageInfo[] chooseTargetForNewBlock(
BlockManager bm, String src, DatanodeInfo[]excludedNodes,
String[] favoredNodes,EnumSet<AddBlockFlag> flags,
ValidateAddBlockResult r) throwsIOException {
... ...
return bm.chooseTarget4NewBlock(src, r.numTargets,clientNode,
excludedNodesSet, r.blockSize,
favoredNodesList, r.storagePolicyID,
r.blockType,r.ecPolicy, flags);
}
/**
 * Delegates target selection to the placement policy (blockplacement) and
 * returns the chosen storages. Parameters and surrounding logic are elided
 * in the article.
 */
public DatanodeStorageInfo[] chooseTarget4NewBlock(... ...
) throws IOException {
... ...
final DatanodeStorageInfo[] targets =blockplacement.chooseTarget(src,
numOfReplicas, client, excludedNodes,blocksize,
favoredDatanodeDescriptors,storagePolicy, flags);
... ...
return targets;
}
/**
 * Convenience overload: starts with an empty list of already-chosen storages
 * and does not ask for the chosen nodes to be returned. Note that
 * {@code favoredNodes} is not forwarded by this overload.
 */
DatanodeStorageInfo[] chooseTarget(String src,
    int numOfReplicas, Node writer,
    Set<Node> excludedNodes,
    long blocksize,
    List<DatanodeDescriptor> favoredNodes,
    BlockStoragePolicy storagePolicy,
    EnumSet<AddBlockFlag> flags) {
    return chooseTarget(src, numOfReplicas, writer,
        new ArrayList<DatanodeStorageInfo>(numOfReplicas), false,
        excludedNodes, blocksize, storagePolicy, flags);
}
/**
 * Abstract target-selection hook; the article follows its implementation in
 * BlockPlacementPolicyDefault.
 */
public abstract DatanodeStorageInfo[] chooseTarget(String srcPath,
int numOfReplicas,
Node writer,
List<DatanodeStorageInfo> chosen,
boolean returnChosenNodes,
Set<Node> excludedNodes,
long blocksize,
BlockStoragePolicy storagePolicy,
EnumSet<AddBlockFlag>flags);
Ctrl + h 查找chooseTarget實作類BlockPlacementPolicyDefault.java
/**
 * Default-policy implementation: forwards to the private overload, passing
 * {@code null} for its last argument. {@code srcPath} is not used.
 */
public DatanodeStorageInfo[] chooseTarget(String srcPath,
    int numOfReplicas,
    Node writer,
    List<DatanodeStorageInfo> chosenNodes,
    boolean returnChosenNodes,
    Set<Node> excludedNodes,
    long blocksize,
    final BlockStoragePolicy storagePolicy,
    EnumSet<AddBlockFlag> flags) {
    final DatanodeStorageInfo[] targets = chooseTarget(
        numOfReplicas, writer, chosenNodes, returnChosenNodes,
        excludedNodes, blocksize, storagePolicy, flags, null);
    return targets;
}
private DatanodeStorageInfo[]chooseTarget(int numOfReplicas,
Node writer,
List<DatanodeStorageInfo> chosenStorage,
boolean returnChosenNodes,
Set<Node> excludedNodes,
long blocksize,
final BlockStoragePolicy storagePolicy,
EnumSet<AddBlockFlag> addBlockFlags,
EnumMap<StorageType, Integer> sTypes) {
… …
int[] result =getMaxNodesPerRack(chosenStorage.size(), numOfReplicas);
numOfReplicas = result[0];
int maxNodesPerRack = result[1];
for (DatanodeStorageInfo storage :chosenStorage) {
// add localMachine and related nodes toexcludedNodes
// 獲取不可用的DN
addToExcludedNodes(storage.getDatanodeDescriptor(),excludedNodes);
}
List<DatanodeStorageInfo> results =null;
Node localNode = null;
boolean avoidStaleNodes = (stats != null
&&stats.isAvoidingStaleDataNodesForWrite());
//
boolean avoidLocalNode = (addBlockFlags !=null
&&addBlockFlags.contains(AddBlockFlag.NO_LOCAL_WRITE)
&& writer != null
&&!excludedNodes.contains(writer));
// Attempt to exclude local node if theclient suggests so. If no enough
// nodes can be obtained, it falls back tothe default block placement
// policy.
// 有資料正在寫,避免都寫入本地
if (avoidLocalNode) {
results = new ArrayList<>(chosenStorage);
Set<Node> excludedNodeCopy = new HashSet<>(excludedNodes);
if (writer != null) {
excludedNodeCopy.add(writer);
}
localNode = chooseTarget(numOfReplicas,writer,
excludedNodeCopy, blocksize,maxNodesPerRack, results,
avoidStaleNodes, storagePolicy,
EnumSet.noneOf(StorageType.class),results.isEmpty(), sTypes);
if (results.size() < numOfReplicas) {
// not enough nodes; discard results andfall back
results = null;
}
}
if (results == null) {
results = newArrayList<>(chosenStorage);
// 真正的選擇DN節點
localNode = chooseTarget(numOfReplicas, writer, excludedNodes,
blocksize, maxNodesPerRack, results,avoidStaleNodes,
storagePolicy,EnumSet.noneOf(StorageType.class), results.isEmpty(),
sTypes);
}
if (!returnChosenNodes) {
results.removeAll(chosenStorage);
}
// sorting nodes to form a pipeline
return getPipeline(
(writer != null && writerinstanceof DatanodeDescriptor) ? writer
: localNode,
results.toArray(newDatanodeStorageInfo[results.size()]));
}
/**
 * Inner overload that delegates to chooseTargetInOrder; the remaining
 * parameters and the rest of the body are elided in the article.
 */
private Node chooseTarget(int numOfReplicas,
... ...) {
writer = chooseTargetInOrder(numOfReplicas, writer,excludedNodes, blocksize,
maxNodesPerRack, results,avoidStaleNodes, newBlock, storageTypes);
... ...
}
protected Node chooseTargetInOrder(int numOfReplicas,
Node writer,
final Set<Node> excludedNodes,
final longblocksize,
final intmaxNodesPerRack,
final List<DatanodeStorageInfo> results,
final booleanavoidStaleNodes,
final booleannewBlock,
EnumMap<StorageType, Integer> storageTypes)
throws NotEnoughReplicasException {
final int numOfResults = results.size();
if (numOfResults == 0) {
// 第一個塊存盤在當前節點
DatanodeStorageInfo storageInfo = chooseLocalStorage(writer,
excludedNodes, blocksize,maxNodesPerRack, results, avoidStaleNodes,
storageTypes, true);
writer = (storageInfo != null) ?storageInfo.getDatanodeDescriptor()
: null;
if (--numOfReplicas == 0) {
return writer;
}
}
final DatanodeDescriptor dn0 =results.get(0).getDatanodeDescriptor();
// 第二個塊存盤在另外一個機架
if (numOfResults <= 1) {
chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes, storageTypes);
if (--numOfReplicas == 0) {
return writer;
}
}
if (numOfResults <= 2) {
final DatanodeDescriptor dn1 =results.get(1).getDatanodeDescriptor();
// 如果第一個和第二個在同一個機架,那么第三個放在其他機架
if (clusterMap.isOnSameRack(dn0, dn1)) {
chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes,storageTypes);
} else if (newBlock){
// 如果是新塊,和第二個塊存盤在同一個機架
chooseLocalRack(dn1, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes,storageTypes);
} else {
// 如果不是新塊,放在當前機架
chooseLocalRack(writer,excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes,storageTypes);
}
if (--numOfReplicas == 0) {
return writer;
}
}
chooseRandom(numOfReplicas, NodeBase.ROOT,excludedNodes, blocksize,
maxNodesPerRack, results,avoidStaleNodes, storageTypes);
return writer;
}
** 三、 建立管道之Socket發送 **
NameNode處理完客戶端的addBlock請求后,流程回到客戶端的DataStreamer執行緒,繼續與DataNode建立管道
點擊nextBlockOutputStream
protected LocatedBlock nextBlockOutputStream() throws IOException {
LocatedBlock lb;
DatanodeInfo[] nodes;
StorageType[] nextStorageTypes;
String[] nextStorageIDs;
int count =dfsClient.getConf().getNumBlockWriteRetry();
boolean success;
final ExtendedBlock oldBlock =block.getCurrentBlock();
do {
errorState.resetInternalError();
lastException.clear();
DatanodeInfo[] excluded =getExcludedNodes();
// 向NN獲取向哪個DN寫資料
lb = locateFollowingBlock(
excluded.length > 0 ? excluded :null, oldBlock);
// 創建管道
success = createBlockOutputStream(nodes, nextStorageTypes,nextStorageIDs,
0L, false);
… …
} while (!success && --count >=0);
if (!success) {
throw new IOException("Unable tocreate new block.");
}
return lb;
}
boolean createBlockOutputStream(DatanodeInfo[]nodes,
StorageType[] nodeStorageTypes, String[]nodeStorageIDs,
long newGS, boolean recoveryFlag) {
... ...
// 和DN創建socket
s = createSocketForPipeline(nodes[0],nodes.length, dfsClient);
// 獲取輸出流,用于寫資料到DN
OutputStreamunbufOut =NetUtils.getOutputStream(s, writeTimeout);
// 獲取輸入流,用于讀取寫資料到DN的結果
InputStreamunbufIn = NetUtils.getInputStream(s,readTimeout);
IOStreamPair saslStreams =dfsClient.saslClient.socketSend(s,
unbufOut, unbufIn, dfsClient,accessToken, nodes[0]);
unbufOut = saslStreams.out;
unbufIn = saslStreams.in;
out = new DataOutputStream(new BufferedOutputStream(unbufOut,
DFSUtilClient.getSmallBufferSize(dfsClient.getConfiguration())));
blockReplyStream = new DataInputStream(unbufIn);
// 發送資料
new Sender(out).writeBlock(blockCopy,nodeStorageTypes[0], accessToken,
dfsClient.clientName, nodes,nodeStorageTypes, null, bcs,
nodes.length, block.getNumBytes(),bytesSent, newGS,
checksum4WriteBlock,cachingStrategy.get(), isLazyPersistFile,
(targetPinnings != null &&targetPinnings[0]), targetPinnings,
nodeStorageIDs[0], nodeStorageIDs);
... ...
}
/**
 * Sender side of the write-block operation: after building the request
 * (elided in the article) it is sent with the WRITE_BLOCK opcode.
 */
public void writeBlock(... ...) throws IOException{
... ...
send(out, Op.WRITE_BLOCK, proto.build());
}
** 四、 建立管道之Socket接收 **
1)點擊create
Ctrl +n 全域查找DataXceiverServer.java,在該類中查找run方法
public void run(){
Peer peer = null;
while (datanode.shouldRun &&!datanode.shutdownForUpgrade) {
try {
// 接收socket的請求
peer = peerServer.accept();
// Make sure the xceiver count is notexceeded
int curXceiverCount =datanode.getXceiverCount();
if (curXceiverCount > maxXceiverCount){
throw new IOException("Xceivercount " + curXceiverCount
+ " exceeds the limit ofconcurrent xcievers: "
+ maxXceiverCount);
}
// 客戶端每發送一個block,都啟動一個DataXceiver去處理block
new Daemon(datanode.threadGroup,
DataXceiver.create(peer, datanode, this))
.start();
} catch (SocketTimeoutException ignored) {
... ...
}
}
... ...
}
點擊DataXceiver(執行緒),查找run方法
public void run() {
int opsProcessed = 0;
Op op = null;
try {
synchronized(this) {
xceiver = Thread.currentThread();
}
dataXceiverServer.addPeer(peer,Thread.currentThread(), this);
peer.setWriteTimeout(datanode.getDnConf().socketWriteTimeout);
InputStream input = socketIn;
try {
IOStreamPair saslStreams =datanode.saslServer.receive(peer, socketOut,
socketIn,datanode.getXferAddress().getPort(),
return;
}
super.initialize(newDataInputStream(input));
do {
updateCurrentThreadName("Waiting foroperation #" + (opsProcessed + 1));
try {
if (opsProcessed != 0) {
assert dnConf.socketKeepaliveTimeout> 0;
peer.setReadTimeout(dnConf.socketKeepaliveTimeout);
} else {
peer.setReadTimeout(dnConf.socketTimeout);
}
// 讀取這次資料的請求型別
op = readOp();
} catch (InterruptedIOException ignored){
// Time out while we wait for clientrpc
break;
} catch (EOFException |ClosedChannelException e) {
// Since we optimistically expect thenext op, it's quite normal to
// get EOF here.
LOG.debug("Cached {} closing after {}ops. " +
"This message is usuallybenign.", peer, opsProcessed);
break;
} catch (IOException err) {
incrDatanodeNetworkErrors();
throw err;
}
// restore normal timeout
if (opsProcessed != 0) {
peer.setReadTimeout(dnConf.socketTimeout);
}
opStartTime = monotonicNow();
// 根據操作型別處理我們的資料
processOp(op);
++opsProcessed;
} while ((peer != null) &&
(!peer.isClosed() &&dnConf.socketKeepaliveTimeout > 0));
} catch (Throwable t) {
... ...
}
}
protected finalvoid processOp(Op op) throws IOException {
switch(op) {
... ...
case WRITE_BLOCK:
opWriteBlock(in);
break;
... ...
default:
throw new IOException("Unknown op" + op + " in data stream");
}
}
private void opWriteBlock(DataInputStream in) throwsIOException {
final OpWriteBlockProto proto =OpWriteBlockProto.parseFrom(vintPrefixed(in));
final DatanodeInfo[] targets = PBHelperClient.convert(proto.getTargetsList());
TraceScope traceScope =continueTraceSpan(proto.getHeader(),
proto.getClass().getSimpleName());
try {
writeBlock(PBHelperClient.convert(proto.getHeader().getBaseHeader().getBlock()),
PBHelperClient.convertStorageType(proto.getStorageType()),
PBHelperClient.convert(proto.getHeader().getBaseHeader().getToken()),
proto.getHeader().getClientName(),
targets,
PBHelperClient.convertStorageTypes(proto.getTargetStorageTypesList(),targets.length),
PBHelperClient.convert(proto.getSource()),
fromProto(proto.getStage()),
proto.getPipelineSize(),
proto.getMinBytesRcvd(),proto.getMaxBytesRcvd(),
proto.getLatestGenerationStamp(),
fromProto(proto.getRequestedChecksum()),
(proto.hasCachingStrategy() ?
getCachingStrategy(proto.getCachingStrategy()) :
CachingStrategy.newDefaultStrategy()),
(proto.hasAllowLazyPersist() ?proto.getAllowLazyPersist() : false),
(proto.hasPinning() ?proto.getPinning(): false),
(PBHelperClient.convertBooleanList(proto.getTargetPinningsList())),
proto.getStorageId(),
proto.getTargetStorageIdsList().toArray(new String[0]));
} finally {
if (traceScope != null) traceScope.close();
}
}
Ctrl +alt +b 查找writeBlock的實作類DataXceiver.java
public void writeBlock(... ...) throws IOException{
... ...
try {
final Replica replica;
if (isDatanode ||
stage !=BlockConstructionStage.PIPELINE_CLOSE_RECOVERY) {
// open a block receiver
// 創建一個BlockReceiver
setCurrentBlockReceiver(getBlockReceiver(block, storageType, in,
peer.getRemoteAddressString(),
peer.getLocalAddressString(),
stage, latestGenerationStamp,minBytesRcvd, maxBytesRcvd,
clientname, srcDataNode, datanode,requestedChecksum,
cachingStrategy, allowLazyPersist,pinning, storageId));
replica = blockReceiver.getReplica();
} else {
replica = datanode.data.recoverClose(
block, latestGenerationStamp,minBytesRcvd);
}
storageUuid = replica.getStorageUuid();
isOnTransientStorage = replica.isOnTransientStorage();
//
// Connect to downstream machine, ifappropriate
// 繼續連接下游的機器
if (targets.length > 0) {
InetSocketAddress mirrorTarget = null;
// Connect to backup machine
mirrorNode =targets[0].getXferAddr(connectToDnViaHostname);
LOG.debug("Connecting to datanode{}", mirrorNode);
mirrorTarget =NetUtils.createSocketAddr(mirrorNode);
// 向新的副本發送socket
mirrorSock = datanode.newSocket();
try {
... ...
if (targetPinnings != null &&targetPinnings.length > 0) {
// 往下游socket發送資料
new Sender(mirrorOut).writeBlock(originalBlock,targetStorageTypes[0],
blockToken, clientname, targets,targetStorageTypes,
srcDataNode, stage, pipelineSize,minBytesRcvd, maxBytesRcvd,
latestGenerationStamp,requestedChecksum, cachingStrategy,
allowLazyPersist,targetPinnings[0], targetPinnings,
targetStorageId, targetStorageIds);
} else {
new Sender(mirrorOut).writeBlock(originalBlock, targetStorageTypes[0],
blockToken, clientname, targets,targetStorageTypes,
srcDataNode, stage, pipelineSize,minBytesRcvd, maxBytesRcvd,
latestGenerationStamp,requestedChecksum, cachingStrategy,
allowLazyPersist, false,targetPinnings,
targetStorageId,targetStorageIds);
}
mirrorOut.flush();
DataNodeFaultInjector.get().writeBlockAfterFlush();
// read connect ack (only for clients,not for replication req)
if (isClient) {
BlockOpResponseProto connectAck =
BlockOpResponseProto.parseFrom(PBHelperClient.vintPrefixed(mirrorIn));
mirrorInStatus =connectAck.getStatus();
firstBadLink =connectAck.getFirstBadLink();
if (mirrorInStatus != SUCCESS) {
LOG.debug("Datanode {} gotresponse for connect" +
"ack from downstream datanode with firstbadlink as{}",
targets.length, firstBadLink);
}
}
… …
//update metrics
datanode.getMetrics().addWriteBlockOp(elapsed());
datanode.getMetrics().incrWritesFromClient(peer.isLocal(),size);
}
BlockReceiver getBlockReceiver(
final ExtendedBlock block, finalStorageType storageType,
final DataInputStream in,
final String inAddr, final String myAddr,
final BlockConstructionStage stage,
final long newGs, final long minBytesRcvd,final long maxBytesRcvd,
final String clientname, final DatanodeInfosrcDataNode,
final DataNode dn, DataChecksumrequestedChecksum,
CachingStrategy cachingStrategy,
final boolean allowLazyPersist,
final boolean pinning,
final String storageId) throws IOException{
return new BlockReceiver(block, storageType, in,
inAddr, myAddr, stage, newGs,minBytesRcvd, maxBytesRcvd,
clientname, srcDataNode, dn,requestedChecksum,
cachingStrategy, allowLazyPersist,pinning, storageId);
}
/**
 * Constructor excerpt: chooses how the replica is created on this DataNode.
 * Replication/move requests get a temporary replica; client writes in the
 * PIPELINE_SETUP_CREATE stage get an RBW replica and the NameNode is notified.
 * The other stages and the rest of the constructor are elided in the article.
 *
 * @throws IOException if the stage is unsupported or the replica cannot be created
 */
BlockReceiver(final ExtendedBlock block, final StorageType storageType,
    final DataInputStream in,
    final String inAddr, final String myAddr,
    final BlockConstructionStage stage,
    final long newGs, final long minBytesRcvd, final long maxBytesRcvd,
    final String clientname, final DatanodeInfo srcDataNode,
    final DataNode datanode, DataChecksum requestedChecksum,
    CachingStrategy cachingStrategy,
    final boolean allowLazyPersist,
    final boolean pinning,
    final String storageId) throws IOException {
    // ... (omitted in the article) ...
    if (isDatanode) { //replication or move
        replicaHandler =
            datanode.data.createTemporary(storageType, storageId, block, false);
    } else {
        switch (stage) {
        case PIPELINE_SETUP_CREATE:
            // Pipeline creation: create the RBW replica for the new block.
            replicaHandler = datanode.data.createRbw(storageType, storageId,
                block, allowLazyPersist);
            datanode.notifyNamenodeReceivingBlock(
                block, replicaHandler.getReplica().getStorageUuid());
            break;
        // ... (other stages omitted in the article) ...
        default:
            throw new IOException("Unsupported stage " + stage +
                " while receiving block " + block + " from " + inAddr);
        }
    }
    // ... (omitted in the article) ...
}
public ReplicaHandler createRbw(
StorageType storageType, String storageId,ExtendedBlock b,
boolean allowLazyPersist) throwsIOException {
try (AutoCloseableLock lock =datasetLock.acquire()) {
... ...
if (ref == null) {
ref = volumes.getNextVolume(storageType,storageId, b.getNumBytes());
}
FsVolumeImpl v = (FsVolumeImpl)ref.getVolume();
// create an rbw file to hold block in thedesignated volume
if (allowLazyPersist &&!v.isTransientStorage()) {
datanode.getMetrics().incrRamDiskBlocksWriteFallback();
}
ReplicaInPipeline newReplicaInfo;
try {
// 創建輸出流的臨時寫檔案
new ReplicaInfo = v.createRbw(b);
if(new ReplicaInfo.getReplicaInfo().getState() != ReplicaState.RBW) {
throw new IOException("CreateRBWreturned a replica of state "
+new ReplicaInfo.getReplicaInfo().getState()
+ " for block " +b.getBlockId());
}
} catch (IOException e) {
IOUtils.cleanup(null, ref);
throw e;
}
volumeMap.add(b.getBlockPoolId(),newReplicaInfo.getReplicaInfo());
return new ReplicaHandler(newReplicaInfo,ref);
}
}
public ReplicaHandler createRbw(
StorageType storageType, String storageId,ExtendedBlock b,
boolean allowLazyPersist) throwsIOException {
try (AutoCloseableLock lock =datasetLock.acquire()) {
... ...
if (ref == null) {
// 有可能有多個臨時寫檔案
ref = volumes.getNextVolume(storageType, storageId,b.getNumBytes());
}
FsVolumeImpl v = (FsVolumeImpl)ref.getVolume();
// create an rbw file to hold block in thedesignated volume
if (allowLazyPersist &&!v.isTransientStorage()) {
datanode.getMetrics().incrRamDiskBlocksWriteFallback();
}
ReplicaInPipeline newReplicaInfo;
try {
// 創建輸出流的臨時寫檔案
newReplicaInfo = v.createRbw(b);
if(newReplicaInfo.getReplicaInfo().getState() != ReplicaState.RBW) {
throw new IOException("CreateRBWreturned a replica of state "
+ newReplicaInfo.getReplicaInfo().getState()
+ " for block " +b.getBlockId());
}
} catch (IOException e) {
IOUtils.cleanup(null, ref);
throw e;
}
volumeMap.add(b.getBlockPoolId(), newReplicaInfo.getReplicaInfo());
return new ReplicaHandler(newReplicaInfo,ref);
}
}
/**
 * Creates the RBW file for block {@code b} on this volume and builds the
 * in-pipeline replica object that describes it.
 *
 * @param b the block to create a replica for
 * @return the new replica in RBW state
 * @throws IOException if the RBW file cannot be created
 */
public ReplicaInPipeline createRbw(ExtendedBlock b) throws IOException {
    File f = createRbwFile(b.getBlockPoolId(), b.getLocalBlock());
    LocalReplicaInPipeline newReplicaInfo = new ReplicaBuilder(ReplicaState.RBW)
        .setBlockId(b.getBlockId())
        .setGenerationStamp(b.getGenerationStamp())
        .setFsVolume(this)
        .setDirectoryToUse(f.getParentFile())
        .setBytesToReserve(b.getNumBytes())
        .buildLocalReplicaInPipeline();
    return newReplicaInfo;
}
** 五、 客戶端接收DN寫資料應答Response **
DataNode管道建立完成后,流程再次回到客戶端的DataStreamer執行緒,啟動ResponseProcessor接收DataNode的應答
Ctrl + n全域查找DataStreamer,搜索run方法
DataStreamer.java
@Override
public void run() {
long lastPacket = Time.monotonicNow();
TraceScope scope = null;
while (!streamerClosed &&dfsClient.clientRunning) {
// if the Responder encountered an error, shutdown Responder
if (errorState.hasError()) {
closeResponder();
}
DFSPacket one;
try {
// process datanode IO errors ifany
boolean doSleep =processDatanodeOrExternalError();
final int halfSocketTimeout =dfsClient.getConf().getSocketTimeout()/2;
synchronized (dataQueue) {
// wait for a packet to be sent.
long now = Time.monotonicNow();
while ((!shouldStop() && dataQueue.size() == 0 &&
(stage !=BlockConstructionStage.DATA_STREAMING ||
now - lastPacket < halfSocketTimeout)) ||doSleep) {
long timeout =halfSocketTimeout - (now-lastPacket);
timeout = timeout <= 0 ?1000 : timeout;
timeout = (stage ==BlockConstructionStage.DATA_STREAMING)?
timeout : 1000;
try {
// 如果dataQueue里面沒有資料,代碼會阻塞在這兒
dataQueue.wait(timeout); // 接收到notify訊息
} catch(InterruptedException e) {
LOG.warn("Caught exception", e);
}
doSleep = false;
now = Time.monotonicNow();
}
if (shouldStop()) {
continue;
}
// get packet to be sent.
if (dataQueue.isEmpty()) {
one =createHeartbeatPacket();
} else {
try {
backOffIfNecessary();
} catch(InterruptedException e) {
LOG.warn("Caught exception", e);
}
// 佇列不為空,從佇列中取出packet
one = dataQueue.getFirst(); // regular data packet
SpanId[] parents =one.getTraceParents();
if (parents.length > 0){
scope = dfsClient.getTracer().
newScope("dataStreamer",parents[0]);
scope.getSpan().setParents(parents);
}
}
}
// get new block from namenode.
if (LOG.isDebugEnabled()) {
LOG.debug("stage=" + stage + ", " + this);
}
if (stage ==BlockConstructionStage.PIPELINE_SETUP_CREATE) {
LOG.debug("Allocating new block: {}", this);
// 步驟一:向NameNode 申請block 并建立資料管道
setPipeline(nextBlockOutputStream());
// 步驟二:啟動ResponseProcessor用來監聽packet發送是否成功
initDataStreaming();
} else if (stage ==BlockConstructionStage.PIPELINE_SETUP_APPEND) {
LOG.debug("Append to block {}", block);
setupPipelineForAppendOrRecovery();
if (streamerClosed) {
continue;
}
initDataStreaming();
}
long lastByteOffsetInBlock =one.getLastByteOffsetBlock();
if (lastByteOffsetInBlock >stat.getBlockSize()) {
throw new IOException("BlockSize " + stat.getBlockSize() +
" < lastByteOffsetInBlock, " +this + ", " + one);
}
if (one.isLastPacketInBlock()) {
// wait for all data packets have been successfully acked
synchronized (dataQueue) {
while (!shouldStop()&& ackQueue.size() != 0) {
try {
// wait for acks toarrive from datanodes
dataQueue.wait(1000);
} catch (InterruptedException e) {
LOG.warn("Caughtexception", e);
}
}
}
if (shouldStop()) {
continue;
}
stage = BlockConstructionStage.PIPELINE_CLOSE;
}
// send the packet
SpanId spanId = SpanId.INVALID;
synchronized (dataQueue) {
// move packet from dataQueue to ackQueue
if (!one.isHeartbeatPacket()) {
if (scope != null) {
spanId = scope.getSpanId();
scope.detach();
one.setTraceScope(scope);
}
scope = null;
// 步驟三:從dataQueue 把要發送的這個packet 移除出去
dataQueue.removeFirst();
// 步驟四:然后往ackQueue 里面添加這個packet
ackQueue.addLast(one);
packetSendTime.put(one.getSeqno(),Time.monotonicNow());
dataQueue.notifyAll();
}
}
LOG.debug("{} sending{}", this, one);
// write out data to remotedatanode
try (TraceScope ignored =dfsClient.getTracer().
newScope("DataStreamer#writeTo",spanId)) {
// 將資料寫出去
one.writeTo(blockStream);
blockStream.flush();
} catch (IOException e) {
errorState.markFirstNodeIfNotMarked();
throw e;
}
lastPacket = Time.monotonicNow();
// update bytesSent
long tmpBytesSent = one.getLastByteOffsetBlock();
if (bytesSent < tmpBytesSent) {
bytesSent = tmpBytesSent;
}
if (shouldStop()) {
continue;
}
// Is this block full?
if (one.isLastPacketInBlock()) {
// wait for the close packet has been acked
synchronized (dataQueue) {
while (!shouldStop()&& ackQueue.size() != 0) {
dataQueue.wait(1000);// wait for acks toarrive from datanodes
}
}
if (shouldStop()) {
continue;
}
endBlock();
}
if (progress != null) { progress.progress();}
// This is used by unit test totrigger race conditions.
if (artificialSlowdown != 0&& dfsClient.clientRunning) {
Thread.sleep(artificialSlowdown);
}
}catch (Throwable e) {
... ...
}finally {
if (scope != null) {
scope.close();
scope = null;
}
}
}
closeInternal();
}
private void initDataStreaming() {
this.setName("DataStreamer for file" + src +
" block " + block);
... ...
response = new ResponseProcessor(nodes);
response.start();
stage =BlockConstructionStage.DATA_STREAMING;
}
點擊response再點擊ResponseProcessor,ctrl + f 查找run方法
/**
 * ResponseProcessor loop (heavily elided in the article). The visible part
 * removes the first packet from ackQueue, drops its send timestamp and
 * wakes up the threads waiting on dataQueue.
 */
public void run(){
... ...
// Remove the head of ackQueue and forget when it was sent.
ackQueue.removeFirst();
packetSendTime.remove(seqno);
// Wake up threads blocked on dataQueue.
dataQueue.notifyAll();
... ...
}
相關文章:
Hadoop之HDFS的I/O流操作
Hadoop(HDFS)之 資料完整性
Hadoop(HDFS)之CheckPoint時間設定
Hadoop小試牛刀——HDFS集群壓測
Hadoop運維工具箱之HDFS異構存盤
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/467014.html
標籤:其他
