I am testing DDP ResNet50 training through fleet on a single-machine, multi-card "intel_gpu" setup. The forward pass completes normally, but the backward pass fails on an incorrect memcpy.
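For context, here is a minimal sketch of the run, assuming the standard fleet collective setup and the "intel_gpu" custom-device plugin (a simplified, hypothetical stand-in for my actual script, not the exact code):

```python
# Hypothetical minimal repro; the real run is a standard fleet ResNet50 DDP loop.
# Assumes the "intel_gpu" custom-device plugin is installed and the script is
# launched per card, e.g.: python -m paddle.distributed.launch --devices "0,1" train.py
import paddle
from paddle.distributed import fleet
from paddle.vision.models import resnet50

paddle.set_device("intel_gpu")               # custom device; launch assigns the card id
fleet.init(is_collective=True)

model = fleet.distributed_model(resnet50())  # DDP wrapper, installs EagerReducer hooks
opt = fleet.distributed_optimizer(
    paddle.optimizer.Momentum(parameters=model.parameters()))

x = paddle.randn([32, 3, 224, 224])          # batch 32, matching mb32 in the oneDNN log
label = paddle.randint(0, 1000, [32, 1])

loss = paddle.nn.CrossEntropyLoss()(model(x), label)
loss.backward()  # forward is fine; this backward hits the segfault logged below
opt.step()
```

The relevant log excerpt from the failing rank: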
......
I0605 17:04:43.523797 1570317 dygraph_functions.cc:39262] Finish AD API: gaussian
I0605 17:04:43.523912 1570317 dygraph_functions.cc:39276] { Input: [],
Output: [
( out , [{Name: None, Initialized: 1, Ptr: 0x72edc60 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512, 512, 3, 3 ], ADInfo:[ None ]}]), ] }
I0605 17:04:43.524639 1570317 eager.cc:653] args_num: 5
I0605 17:04:43.524663 1570317 eager.cc:823] Calling case2's initializer.
I0605 17:04:43.524729 1570317 grad_node_info.cc:64] Construct GradNodeBase
I0605 17:04:43.524763 1570317 accumulation_node.h:27] Construct GradNodeAccumulation
I0605 17:04:43.524791 1570317 eager.cc:107] Tensor(batch_norm2d_48.w_0) have not GradNode, add GradNodeAccumulation0x85cdae0 for it.
I0605 17:04:43.524863 1570317 eager_properties.cc:198] eager_properties 'Shape' method, layout autotune desired_layout: Undefined(AnyLayout) default_layout: Undefined(AnyLayout) tensor layout: NCHW tensor's shape size is : 1
I0605 17:04:43.524895 1570317 eager_op_function.cc:19529] Running Eager Final State API: full_
I0605 17:04:43.524907 1570317 eager_op_function.cc:19531] args count: 2
I0605 17:04:43.524927 1570317 eager_utils.cc:1424] type_name: str
I0605 17:04:43.525002 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:04:43.525034 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:04:43.525020 1570317 eager_op_function.cc:19562] CurrentDeviceId: 0 from 0
I0605 17:04:43.525058 1570317 dygraph_functions.cc:38666] Running AD API: full_
I0605 17:04:43.525069 1570317 dygraph_functions.cc:38672] No AMP for full__ad_func because it is a inplace or cast api.
I0605 17:04:43.525080 1570317 dygraph_functions.cc:38692] Running C++ API: full_
I0605 17:04:43.525146 1570317 dygraph_functions.cc:38703] { Input: [
( output , [{Name: batch_norm2d_48.w_0, Initialized: 0, Ptr: 0x72ed510 TensorInfo: [ Type: DenseTensor, Dtype: Unknown, Place: Unknown, Shape: Unknown ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]), ]}
I0605 17:04:43.525171 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:04:43.525214 1570317 api.cc:26073] full_ API kernel key: [intel_gpu, NCHW, float32]
I0605 17:04:43.525262 1570317 api.cc:26080] full kernel: {"input":[],"output":["intel_gpu, NCHW, float32"],"attribute":["IntArray","Scalar","DataType"]}
I0605 17:04:43.525293 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:04:43.525385 1570317 full_kernel.cc:25] FullValue type=float
I0605 17:04:43.525417 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:04:43.525431 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:04:43.525476 1570317 runtime.cc:234] request allocate size=2048 device=0
I0605 17:04:43.525560 1570317 runtime.cc:258] allocate success size=2048 left=1765120799
I0605 17:04:43.525624 1570317 auto_growth_best_fit_allocator.cc:118] Not found and reallocate 2048(0xffff8181fe020000), and remaining 0
I0605 17:04:43.525640 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181fe020000
I0605 17:04:43.525681 1570317 full_kernel.cc:29] FullValue size=512 sizeof(T)=4
I0605 17:04:43.526175 1570317 dygraph_functions.cc:38717] Finish AD API: full_
I0605 17:04:43.526319 1570317 dygraph_functions.cc:38734] { Input: [
( output , [{Name: batch_norm2d_48.w_0, Initialized: 1, Ptr: 0x72ed510 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]), ],
Output: [
( out , [{Name: batch_norm2d_48.w_0, Initialized: 1, Ptr: 0x72ed510 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]), ] }
......
I0605 17:04:46.935668 1570317 reducer.cc:101] var[conv2d_48.w_0] 's type is float32
I0605 17:04:46.935672 1570317 reducer.cc:101] var[batch_norm2d_48.w_0] 's type is float32
I0605 17:04:46.935675 1570317 reducer.cc:101] var[batch_norm2d_48.b_0] 's type is float32
I0605 17:04:46.935679 1570317 reducer.cc:101] var[conv2d_49.w_0] 's type is float32
......
I0605 17:05:02.728678 1570317 eager_op_function.cc:16839] Running Eager Final State API: batch_norm
I0605 17:05:02.728682 1570317 eager_op_function.cc:16841] args count: 5
I0605 17:05:02.728744 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:02.728756 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:05:02.728751 1570317 eager_op_function.cc:16881] CurrentDeviceId: 0 from 0
I0605 17:05:02.728763 1570317 dygraph_functions.cc:33987] Running AD API: batch_norm
I0605 17:05:02.728767 1570317 dygraph_functions.cc:34050] Running C++ API: batch_norm
I0605 17:05:02.728855 1570317 dygraph_functions.cc:34073] { Input: [
( x , [{Name: None, Initialized: 1, Ptr: 0x1a0755d0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [2]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1923ac40, ReluGradNode] }, ]SlotID: 1, StopGradients: 0, , Edges[ { [0, 0]: [0x72ed8a0, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( mean , [{Name: batch_norm2d_48.w_1, Initialized: 1, Ptr: 0x85cf840 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 1 ] ]}]),
( variance , [{Name: batch_norm2d_48.w_2, Initialized: 1, Ptr: 0x82d3300 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 1 ] ]}]),
( scale , [{Name: batch_norm2d_48.w_0, Initialized: 1, Ptr: 0x72ed510 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( bias , [{Name: batch_norm2d_48.b_0, Initialized: 1, Ptr: 0x85cec70 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]), ]}
I0605 17:05:02.728883 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:02.728912 1570317 api.cc:22540] batch_norm API kernel key: [intel_gpu, NCHW, float32]
I0605 17:05:02.728945 1570317 api.cc:22547] batch_norm kernel: {"input":["intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32"],"output":["intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32"],"attribute":["bool","float","float","string","bool","bool"]}
I0605 17:05:02.728962 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:05:02.729009 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:05:02.729014 1570317 data_transform.cc:169] DeviceTransform in, src_place Place(cpu) dst_place: Place(intel_gpu:0)
I0605 17:05:02.729025 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:02.729063 1570317 tensor_utils.cc:50] TensorCopy 512 from Place(cpu) to Place(intel_gpu:0)
I0605 17:05:02.729077 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:02.729082 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:02.729099 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:02.729101 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181ff0ca000
I0605 17:05:02.729135 1570317 tensor_utils.cc:97] src:0x85e0000, dst:0xffff8181ff0ca000
I0605 17:05:02.729149 1570317 memcpy.cc:66] memory::Copy 2048 Bytes from Place(cpu)(0x85e0000) to Place(intel_gpu:0)(0xffff8181ff0ca000), stream=0
I0605 17:05:02.729158 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:02.729244 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:02.729259 1570317 runtime.cc:324] sync-stream devid=0
I0605 17:05:02.729274 1570317 runtime.cc:374] memory-copy-h2d dst=0xffff8181ff0ca000 src=0x85e0000 size=2048
I0605 17:05:02.729657 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:05:02.729671 1570317 data_transform.cc:169] DeviceTransform in, src_place Place(cpu) dst_place: Place(intel_gpu:0)
I0605 17:05:02.729679 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:02.729691 1570317 tensor_utils.cc:50] TensorCopy 512 from Place(cpu) to Place(intel_gpu:0)
I0605 17:05:02.729701 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:02.729704 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:02.729713 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:02.729717 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181ff0ca800
I0605 17:05:02.729736 1570317 tensor_utils.cc:97] src:0x8841000, dst:0xffff8181ff0ca800
I0605 17:05:02.729748 1570317 memcpy.cc:66] memory::Copy 2048 Bytes from Place(cpu)(0x8841000) to Place(intel_gpu:0)(0xffff8181ff0ca800), stream=0
I0605 17:05:02.729758 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:02.729801 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:02.729820 1570317 runtime.cc:324] sync-stream devid=0
I0605 17:05:02.729831 1570317 runtime.cc:374] memory-copy-h2d dst=0xffff8181ff0ca800 src=0x8841000 size=2048
I0605 17:05:02.730211 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:05:02.730233 1570317 data_transform.cc:169] DeviceTransform in, src_place Place(cpu) dst_place: Place(intel_gpu:0)
I0605 17:05:02.730244 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:02.730258 1570317 tensor_utils.cc:50] TensorCopy 512 from Place(cpu) to Place(intel_gpu:0)
I0605 17:05:02.730268 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:02.730271 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:02.730279 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:02.730283 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181ff0cc000
I0605 17:05:02.730296 1570317 tensor_utils.cc:97] src:0x7c1a000, dst:0xffff8181ff0cc000
I0605 17:05:02.730306 1570317 memcpy.cc:66] memory::Copy 2048 Bytes from Place(cpu)(0x7c1a000) to Place(intel_gpu:0)(0xffff8181ff0cc000), stream=0
I0605 17:05:02.730315 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:02.730357 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:02.730368 1570317 runtime.cc:324] sync-stream devid=0
I0605 17:05:02.730379 1570317 runtime.cc:374] memory-copy-h2d dst=0xffff8181ff0cc000 src=0x7c1a000 size=2048
I0605 17:05:02.730762 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:05:02.730785 1570317 data_transform.cc:169] DeviceTransform in, src_place Place(cpu) dst_place: Place(intel_gpu:0)
I0605 17:05:02.730795 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:02.730809 1570317 tensor_utils.cc:50] TensorCopy 512 from Place(cpu) to Place(intel_gpu:0)
I0605 17:05:02.730818 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:02.730823 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:02.730829 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:02.730832 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181ff0cc800
I0605 17:05:02.730845 1570317 tensor_utils.cc:97] src:0xe650000, dst:0xffff8181ff0cc800
I0605 17:05:02.730856 1570317 memcpy.cc:66] memory::Copy 2048 Bytes from Place(cpu)(0xe650000) to Place(intel_gpu:0)(0xffff8181ff0cc800), stream=0
I0605 17:05:02.730866 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:02.730907 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:02.730918 1570317 runtime.cc:324] sync-stream devid=0
I0605 17:05:02.730928 1570317 runtime.cc:374] memory-copy-h2d dst=0xffff8181ff0cc800 src=0xe650000 size=2048
I0605 17:05:02.731307 1570317 api.cc:22582] Perform View between Output and Input Tensor, share allocation and inplace version.
I0605 17:05:02.731328 1570317 api.cc:22586] Perform View between Output and Input Tensor, share allocation and inplace version.
I0605 17:05:02.731400 1570317 dense_tensor.cc:139] Allocate data with bytes: 3211264
I0605 17:05:02.731405 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 3211264 bytes, aligned to 3211264
I0605 17:05:02.731415 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 3211264 bytes from chunk size 4194304, remaining 983040
I0605 17:05:02.731431 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 3211264 bytes, ptr = 0xffff81d5fdaf0000
I0605 17:05:02.731453 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:02.731457 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:02.731462 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:02.731464 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181ff0cd000
I0605 17:05:02.731470 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:02.731473 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:02.731477 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:02.731479 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181ff0cd800
I0605 17:05:02.731565 1570317 dense_tensor.cc:139] Allocate data with bytes: 200832
I0605 17:05:02.731571 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 200832 bytes, aligned to 200832
I0605 17:05:02.731577 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 200832 bytes from chunk size 262144, remaining 61312
I0605 17:05:02.731585 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 200832 bytes, ptr = 0xffff8181febeef80
onednn_verbose,exec,gpu:0,batch_normalization,ocl:ref:any,forward_training,data_f32::blocked:abcd:f0 diff_undef::undef::,attr-scratchpad:user ,flags:CH,mb32ic512ih7iw7,0.275146
I0605 17:05:02.732051 1570317 auto_growth_best_fit_allocator.cc:131] Free 200832 bytes, ptr = 0xffff8181febeef80
I0605 17:05:02.732097 1570317 auto_growth_best_fit_allocator.cc:131] Free 2048 bytes, ptr = 0xffff8181ff0cc800
I0605 17:05:02.732105 1570317 auto_growth_best_fit_allocator.cc:131] Free 2048 bytes, ptr = 0xffff8181ff0cc000
I0605 17:05:02.732123 1570317 grad_node_info.cc:64] Construct GradNodeBase
I0605 17:05:02.732144 1570317 grad_node_info.cc:238] Add Edges for slot: 0, the Edge is from BatchNormGradNode (addr: 0x198b7070) to Conv2dGradNodeFinal (addr: 0x1a366ff0)
I0605 17:05:02.732151 1570317 grad_node_info.h:77] Reseting Edge's Grad Node
I0605 17:05:02.732168 1570317 grad_node_info.cc:238] Add Edges for slot: 3, the Edge is from BatchNormGradNode (addr: 0x198b7070) to GradNodeAccumulation (addr: 0x85cdae0)
I0605 17:05:02.732172 1570317 grad_node_info.h:77] Reseting Edge's Grad Node
I0605 17:05:02.732177 1570317 grad_node_info.cc:238] Add Edges for slot: 4, the Edge is from BatchNormGradNode (addr: 0x198b7070) to GradNodeAccumulation (addr: 0x85cf000)
I0605 17:05:02.732182 1570317 grad_node_info.h:77] Reseting Edge's Grad Node
I0605 17:05:02.732187 1570317 grad_node_info.cc:86] Set GradSlotMeta for Grad Inputs
I0605 17:05:02.732193 1570317 grad_node_info.cc:86] Set GradSlotMeta for Grad Inputs
I0605 17:05:02.732198 1570317 grad_node_info.cc:86] Set GradSlotMeta for Grad Inputs
I0605 17:05:02.732203 1570317 grad_node_info.cc:86] Set GradSlotMeta for Grad Inputs
I0605 17:05:02.732208 1570317 grad_node_info.cc:86] Set GradSlotMeta for Grad Inputs
I0605 17:05:02.732213 1570317 grad_node_info.cc:86] Set GradSlotMeta for Grad Inputs
I0605 17:05:02.732218 1570317 grad_node_info.cc:106] Skip Configuring GradSlotMeta for uninitialized GradInput Tensor
I0605 17:05:02.732223 1570317 dygraph_functions.cc:34183] Finish AD API: batch_norm
I0605 17:05:02.732421 1570317 dygraph_functions.cc:34224] { Input: [
( x , [{Name: None, Initialized: 1, Ptr: 0x1a0755d0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [2]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1923ac40, ReluGradNode] }, ]SlotID: 1, StopGradients: 0, , Edges[ { [0, 0]: [0x72ed8a0, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( mean , [{Name: batch_norm2d_48.w_1, Initialized: 1, Ptr: 0x85cf840 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 1 ] ]}]),
( variance , [{Name: batch_norm2d_48.w_2, Initialized: 1, Ptr: 0x82d3300 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 1 ] ]}]),
( scale , [{Name: batch_norm2d_48.w_0, Initialized: 1, Ptr: 0x72ed510 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( bias , [{Name: batch_norm2d_48.b_0, Initialized: 1, Ptr: 0x85cec70 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]), ],
Output: [
( out , [{Name: None, Initialized: 1, Ptr: 0x19561630 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( mean_out , [{Name: None, Initialized: 1, Ptr: 0x1a19dc90 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( variance_out , [{Name: None, Initialized: 1, Ptr: 0x18621490 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( saved_mean , [{Name: None, Initialized: 1, Ptr: 0x188de050 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( saved_variance , [{Name: None, Initialized: 1, Ptr: 0x19dce0f0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( reserve_space , [{Name: None, Initialized: 0, Ptr: 0x19f979d0 TensorInfo: [ Type: DenseTensor, Dtype: Unknown, Place: Unknown, Shape: Unknown ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]), ] }
I0605 17:05:02.732524 1570317 eager_op_function.cc:11066] Running Eager Final State API: relu
......
I0605 17:05:07.889714 1570317 nodes.cc:14271] Finish AD API GRAD: relu_grad
I0605 17:05:07.889755 1570317 nodes.cc:14288] { Input: [
( grad_out , [{Name: None, Initialized: 1, Ptr: 0x7cda990 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ None ], StopGradient: [ 0 ] ]}]),
( out , [{Name: @Saved, Initialized: 1, Ptr: 0x191d79d0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x198b7070, BatchNormGradNode] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]), ],
Output: [
( grad_x , [{Name: None, Initialized: 1, Ptr: 0x19838cd0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ None ], StopGradient: [ 0 ] ]}]), ] }
I0605 17:05:07.889763 1570317 backward.cc:283] retain_graph is false, need to clear the TensorWrapper of nodes.
I0605 17:05:07.889771 1570317 auto_growth_best_fit_allocator.cc:131] Free 3211264 bytes, ptr = 0xffff81ac030c1ea0
I0605 17:05:07.889791 1570317 backward.cc:312] Node: ReluGradNode addr:0x191da280, Found pending node: BatchNormGradNode addr: 0x198b7070
I0605 17:05:07.889796 1570317 backward.cc:339] Get Edge and grad_output_tensor with slot: 0, rank: 0 's name is:
I0605 17:05:07.889798 1570317 grad_tensor_holder.h:32] Init GradTensorHolder with meta size: 6
I0605 17:05:07.889801 1570317 grad_tensor_holder.h:35] Init GradTensorHolder with meta rank: 1
I0605 17:05:07.889804 1570317 grad_tensor_holder.h:35] Init GradTensorHolder with meta rank: 1
I0605 17:05:07.889807 1570317 grad_tensor_holder.h:35] Init GradTensorHolder with meta rank: 1
I0605 17:05:07.889809 1570317 grad_tensor_holder.h:35] Init GradTensorHolder with meta rank: 1
I0605 17:05:07.889822 1570317 grad_tensor_holder.h:35] Init GradTensorHolder with meta rank: 1
I0605 17:05:07.889824 1570317 grad_tensor_holder.h:35] Init GradTensorHolder with meta rank: 1
I0605 17:05:07.889827 1570317 backward.cc:348] Construct GradTensorHolder for grad node: BatchNormGradNode
I0605 17:05:07.889830 1570317 backward.cc:353] Sum or Move grad inputs for edge slot: 0, rank: 0
I0605 17:05:07.889834 1570317 grad_tensor_holder.cc:132] Move Tensor for buffer_ slot: 0, size: 1
I0605 17:05:07.889838 1570317 backward.cc:363] BatchNormGradNode ref_cnt is: 0
I0605 17:05:07.889843 1570317 backward.cc:243] Preparing GradNode:BatchNormGradNode addr:0x198b7070
I0605 17:05:07.889847 1570317 backward.cc:270] Run Backward Kernel with GradTensorHolder.
I0605 17:05:07.889849 1570317 nodes.cc:23093] Running AD API GRAD: batch_norm_grad
I0605 17:05:07.889856 1570317 grad_node_info.cc:43] float32 float32
I0605 17:05:07.889863 1570317 tensor_wrapper.h:137] Recover tensor: @Saved for wrapper
I0605 17:05:07.889868 1570317 tensor_wrapper.h:213] The wrapper_version_snapshot of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889869 1570317 tensor_wrapper.h:216] The tensor_version of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889873 1570317 tensor_wrapper.h:161] Recovered TensorWrapper with GradNode Conv2dGradNodeFinal addr: 0x1a366ff0
I0605 17:05:07.889878 1570317 tensor_wrapper.h:137] Recover tensor: batch_norm2d_48.w_0@Saved for wrapper
I0605 17:05:07.889880 1570317 tensor_wrapper.h:213] The wrapper_version_snapshot of Tensor 'batch_norm2d_48.w_0@Saved' is [ 0 ]
I0605 17:05:07.889883 1570317 tensor_wrapper.h:216] The tensor_version of Tensor 'batch_norm2d_48.w_0@Saved' is [ 0 ]
I0605 17:05:07.889886 1570317 tensor_wrapper.h:161] Recovered TensorWrapper with GradNode GradNodeAccumulation addr: 0x85cdae0
I0605 17:05:07.889889 1570317 tensor_wrapper.h:137] Recover tensor: batch_norm2d_48.b_0@Saved for wrapper
I0605 17:05:07.889895 1570317 tensor_wrapper.h:213] The wrapper_version_snapshot of Tensor 'batch_norm2d_48.b_0@Saved' is [ 0 ]
I0605 17:05:07.889899 1570317 tensor_wrapper.h:216] The tensor_version of Tensor 'batch_norm2d_48.b_0@Saved' is [ 0 ]
I0605 17:05:07.889901 1570317 tensor_wrapper.h:161] Recovered TensorWrapper with GradNode GradNodeAccumulation addr: 0x85cf000
I0605 17:05:07.889904 1570317 tensor_wrapper.h:137] Recover tensor: @Saved for wrapper
I0605 17:05:07.889907 1570317 tensor_wrapper.h:213] The wrapper_version_snapshot of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889910 1570317 tensor_wrapper.h:216] The tensor_version of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889914 1570317 tensor_wrapper.h:161] Recovered TensorWrapper with GradNode BatchNormGradNode addr: 0x198b7070
I0605 17:05:07.889916 1570317 tensor_wrapper.h:137] Recover tensor: @Saved for wrapper
I0605 17:05:07.889919 1570317 tensor_wrapper.h:213] The wrapper_version_snapshot of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889922 1570317 tensor_wrapper.h:216] The tensor_version of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889925 1570317 tensor_wrapper.h:161] Recovered TensorWrapper with GradNode BatchNormGradNode addr: 0x198b7070
I0605 17:05:07.889928 1570317 tensor_wrapper.h:137] Recover tensor: @Saved for wrapper
I0605 17:05:07.889930 1570317 tensor_wrapper.h:213] The wrapper_version_snapshot of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889933 1570317 tensor_wrapper.h:216] The tensor_version of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889936 1570317 tensor_wrapper.h:161] Recovered TensorWrapper with GradNode BatchNormGradNode addr: 0x198b7070
I0605 17:05:07.889940 1570317 tensor_wrapper.h:137] Recover tensor: @Saved for wrapper
I0605 17:05:07.889942 1570317 tensor_wrapper.h:213] The wrapper_version_snapshot of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889945 1570317 tensor_wrapper.h:216] The tensor_version of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889948 1570317 tensor_wrapper.h:161] Recovered TensorWrapper with GradNode BatchNormGradNode addr: 0x198b7070
I0605 17:05:07.889951 1570317 tensor_wrapper.h:137] Recover tensor: @Saved for wrapper
I0605 17:05:07.889953 1570317 tensor_wrapper.h:213] The wrapper_version_snapshot of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889956 1570317 tensor_wrapper.h:216] The tensor_version of Tensor '@Saved' is [ 0 ]
I0605 17:05:07.889959 1570317 tensor_wrapper.h:161] Recovered TensorWrapper with GradNode BatchNormGradNode addr: 0x198b7070
I0605 17:05:07.889963 1570317 nodes.cc:23146] Running C++ API: batch_norm_grad
I0605 17:05:07.890098 1570317 nodes.cc:23181] { Input: [
( grad_out , [{Name: None, Initialized: 1, Ptr: 0x19838cd0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ None ], StopGradient: [ 0 ] ]}]),
( x , [{Name: @Saved, Initialized: 1, Ptr: 0x1a0755d0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [2]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1923ac40, ReluGradNode] }, ]SlotID: 1, StopGradients: 0, , Edges[ { [0, 0]: [0x72ed8a0, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( scale , [{Name: batch_norm2d_48.w_0@Saved, Initialized: 1, Ptr: 0x72ed510 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( bias , [{Name: batch_norm2d_48.b_0@Saved, Initialized: 1, Ptr: 0x85cec70 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( out_mean , [{Name: @Saved, Initialized: 1, Ptr: 0x1a19dc90 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( out_variance , [{Name: @Saved, Initialized: 1, Ptr: 0x18621490 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( saved_mean , [{Name: @Saved, Initialized: 1, Ptr: 0x188de050 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( saved_variance , [{Name: @Saved, Initialized: 1, Ptr: 0x19dce0f0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( reserve_space , [{Name: @Saved, Initialized: 0, Ptr: 0x19f979d0 TensorInfo: [ Type: DenseTensor, Dtype: Unknown, Place: Unknown, Shape: Unknown ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]), ]}
I0605 17:05:07.890122 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:07.890137 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:07.890144 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:07.890151 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:07.890158 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:07.890165 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:07.890172 1570317 backward_api.cc:15475] batch_norm_grad API kernel key: [intel_gpu, NCHW, float32]
I0605 17:05:07.890183 1570317 backward_api.cc:15482] batch_norm_grad kernel: {"input":["intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32"],"output":["intel_gpu, NCHW, float32","intel_gpu, NCHW, float32","intel_gpu, NCHW, float32"],"attribute":["float","float","string","bool","bool","bool"]}
I0605 17:05:07.890195 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:05:07.890215 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:05:07.890220 1570317 data_transform.cc:169] DeviceTransform in, src_place Place(cpu) dst_place: Place(intel_gpu:0)
I0605 17:05:07.890228 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:07.890240 1570317 tensor_utils.cc:50] TensorCopy 512 from Place(cpu) to Place(intel_gpu:0)
I0605 17:05:07.890247 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:07.890251 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:07.890259 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:07.890261 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181febde000
I0605 17:05:07.890275 1570317 tensor_utils.cc:97] src:0x7c1a000, dst:0xffff8181febde000
I0605 17:05:07.890283 1570317 memcpy.cc:66] memory::Copy 2048 Bytes from Place(cpu)(0x7c1a000) to Place(intel_gpu:0)(0xffff8181febde000), stream=0
I0605 17:05:07.890290 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:07.890336 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:07.890345 1570317 runtime.cc:324] sync-stream devid=0
I0605 17:05:07.890355 1570317 runtime.cc:374] memory-copy-h2d dst=0xffff8181febde000 src=0x7c1a000 size=2048
I0605 17:05:07.890707 1570317 runtime.cc:128] get-device() : device->id=0
I0605 17:05:07.890719 1570317 data_transform.cc:169] DeviceTransform in, src_place Place(cpu) dst_place: Place(intel_gpu:0)
I0605 17:05:07.890733 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:07.890744 1570317 tensor_utils.cc:50] TensorCopy 512 from Place(cpu) to Place(intel_gpu:0)
I0605 17:05:07.890753 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:07.890756 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:07.890764 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:07.890769 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181ff0cc000
I0605 17:05:07.890780 1570317 tensor_utils.cc:97] src:0xe650000, dst:0xffff8181ff0cc000
I0605 17:05:07.890790 1570317 memcpy.cc:66] memory::Copy 2048 Bytes from Place(cpu)(0xe650000) to Place(intel_gpu:0)(0xffff8181ff0cc000), stream=0
I0605 17:05:07.890800 1570317 runtime.cc:121] set-device : device->id=0
I0605 17:05:07.890838 1570317 context_pool.cc:62] DeviceContextPool Get: Place(intel_gpu:0)
I0605 17:05:07.890849 1570317 runtime.cc:324] sync-stream devid=0
I0605 17:05:07.890857 1570317 runtime.cc:374] memory-copy-h2d dst=0xffff8181ff0cc000 src=0xe650000 size=2048
I0605 17:05:07.891373 1570317 dense_tensor.cc:139] Allocate data with bytes: 3211264
I0605 17:05:07.891391 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 3211264 bytes, aligned to 3211264
I0605 17:05:07.891400 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 3211264 bytes from chunk size 4194304, remaining 983040
I0605 17:05:07.891409 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 3211264 bytes, ptr = 0xffff81aca4530000
I0605 17:05:07.891424 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:07.891427 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:07.891431 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:07.891434 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181ff0cc800
I0605 17:05:07.891439 1570317 dense_tensor.cc:139] Allocate data with bytes: 2048
I0605 17:05:07.891443 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 2048 bytes, aligned to 2048
I0605 17:05:07.891446 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 2048 bytes from chunk size 2048, remaining 0
I0605 17:05:07.891449 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 2048 bytes, ptr = 0xffff8181ff0ce000
I0605 17:05:07.891551 1570317 dense_tensor.cc:139] Allocate data with bytes: 200832
I0605 17:05:07.891557 1570317 auto_growth_best_fit_allocator.cc:66] Allocate 200832 bytes, aligned to 200832
I0605 17:05:07.891562 1570317 auto_growth_best_fit_allocator.cc:76] Allocate 200832 bytes from chunk size 262144, remaining 61312
I0605 17:05:07.891569 1570317 auto_growth_best_fit_allocator.cc:123] Alloc 200832 bytes, ptr = 0xffff8181febeef80
onednn_verbose,exec,gpu:0,batch_normalization,ocl:ref:any,backward,data_f32::blocked:abcd:f0 diff_f32::blocked:abcd:f0,attr-scratchpad:user ,flags:CH,mb32ic512ih7iw7,0.158936
I0605 17:05:07.891780 1570317 auto_growth_best_fit_allocator.cc:131] Free 200832 bytes, ptr = 0xffff8181febeef80
I0605 17:05:07.891819 1570317 auto_growth_best_fit_allocator.cc:131] Free 2048 bytes, ptr = 0xffff8181ff0cc000
I0605 17:05:07.891827 1570317 auto_growth_best_fit_allocator.cc:131] Free 2048 bytes, ptr = 0xffff8181febde000
I0605 17:05:07.891832 1570317 nodes.cc:23198] Fused api batch_norm_grad is called
I0605 17:05:07.891839 1570317 nodes.cc:23285] Finish AD API GRAD: batch_norm_grad
I0605 17:05:07.892006 1570317 nodes.cc:23329] { Input: [
( grad_out , [{Name: None, Initialized: 1, Ptr: 0x19838cd0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ None ], StopGradient: [ 0 ] ]}]),
( x , [{Name: @Saved, Initialized: 1, Ptr: 0x1a0755d0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [2]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1923ac40, ReluGradNode] }, ]SlotID: 1, StopGradients: 0, , Edges[ { [0, 0]: [0x72ed8a0, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( scale , [{Name: batch_norm2d_48.w_0@Saved, Initialized: 1, Ptr: 0x72ed510 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( bias , [{Name: batch_norm2d_48.b_0@Saved, Initialized: 1, Ptr: 0x85cec70 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(cpu), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [1]: SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( out_mean , [{Name: @Saved, Initialized: 1, Ptr: 0x1a19dc90 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( out_variance , [{Name: @Saved, Initialized: 1, Ptr: 0x18621490 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( saved_mean , [{Name: @Saved, Initialized: 1, Ptr: 0x188de050 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( saved_variance , [{Name: @Saved, Initialized: 1, Ptr: 0x19dce0f0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]),
( reserve_space , [{Name: @Saved, Initialized: 0, Ptr: 0x19f979d0 TensorInfo: [ Type: DenseTensor, Dtype: Unknown, Place: Unknown, Shape: Unknown ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ BackwardOutMeta: [ {SlotSize: [5]: SlotID: 0, StopGradients: 0, , Edges[ { [0, 0]: [0x1a366ff0, Conv2dGradNodeFinal] }, ]SlotID: 1, StopGradients: , Edges[ ]SlotID: 2, StopGradients: , Edges[ ]SlotID: 3, StopGradients: 0, , Edges[ { [0, 0]: [0x85cdae0, GradNodeAccumulation] }, ]SlotID: 4, StopGradients: 0, , Edges[ { [0, 0]: [0x85cf000, GradNodeAccumulation] }, ]} ], BackwardInMeta: [ {SlotSize: [SlotID: 0, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 1, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 2, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 3, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 4, StopGradients: 0, , Edges[ { NULL Edge } ]SlotID: 5, StopGradients: 0, , Edges[ { NULL Edge } ]]: ] ], StopGradient: [ 0 ] ]}]), ],
Output: [
( grad_x , [{Name: None, Initialized: 1, Ptr: 0x6f3e3a0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 32, 512, 7, 7 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ None ], StopGradient: [ 0 ] ]}]),
( grad_scale , [{Name: None, Initialized: 1, Ptr: 0x1a580460 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ None ], StopGradient: [ 0 ] ]}]),
( grad_bias , [{Name: None, Initialized: 1, Ptr: 0x19ad40d0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ None ], StopGradient: [ 0 ] ]}]), ] }
I0605 17:05:07.892024 1570317 backward.cc:283] retain_graph is false, need to clear the TensorWrapper of nodes.
I0605 17:05:07.892030 1570317 auto_growth_best_fit_allocator.cc:131] Free 3211264 bytes, ptr = 0xffff81d5fcaf0000
I0605 17:05:07.892040 1570317 auto_growth_best_fit_allocator.cc:131] Free 2048 bytes, ptr = 0xffff8181ff0ca000
I0605 17:05:07.892047 1570317 auto_growth_best_fit_allocator.cc:131] Free 2048 bytes, ptr = 0xffff8181ff0ca800
I0605 17:05:07.892052 1570317 auto_growth_best_fit_allocator.cc:131] Free 2048 bytes, ptr = 0xffff8181ff0cd000
I0605 17:05:07.892058 1570317 auto_growth_best_fit_allocator.cc:131] Free 2048 bytes, ptr = 0xffff8181ff0cd800
I0605 17:05:07.892067 1570317 backward.cc:312] Node: BatchNormGradNode addr:0x198b7070, Found pending node: Conv2dGradNodeFinal addr: 0x1a366ff0
I0605 17:05:07.892071 1570317 backward.cc:339] Get Edge and grad_output_tensor with slot: 0, rank: 0 's name is:
I0605 17:05:07.892074 1570317 grad_tensor_holder.h:32] Init GradTensorHolder with meta size: 1
I0605 17:05:07.892076 1570317 grad_tensor_holder.h:35] Init GradTensorHolder with meta rank: 1
I0605 17:05:07.892079 1570317 backward.cc:348] Construct GradTensorHolder for grad node: Conv2dGradNodeFinal
I0605 17:05:07.892082 1570317 backward.cc:353] Sum or Move grad inputs for edge slot: 0, rank: 0
I0605 17:05:07.892086 1570317 grad_tensor_holder.cc:132] Move Tensor for buffer_ slot: 0, size: 1
I0605 17:05:07.892091 1570317 backward.cc:363] Conv2dGradNodeFinal ref_cnt is: 0
I0605 17:05:07.892094 1570317 backward.cc:312] Node: BatchNormGradNode addr:0x198b7070, Found pending node: GradNodeAccumulation addr: 0x85cdae0
I0605 17:05:07.892097 1570317 backward.cc:339] Get Edge and grad_output_tensor with slot: 3, rank: 0 's name is:
I0605 17:05:07.892099 1570317 grad_tensor_holder.h:32] Init GradTensorHolder with meta size: 1
I0605 17:05:07.892102 1570317 grad_tensor_holder.h:35] Init GradTensorHolder with meta rank: 1
I0605 17:05:07.892104 1570317 backward.cc:348] Construct GradTensorHolder for grad node: GradNodeAccumulation
I0605 17:05:07.892108 1570317 backward.cc:353] Sum or Move grad inputs for edge slot: 0, rank: 0
I0605 17:05:07.892112 1570317 grad_tensor_holder.cc:132] Move Tensor for buffer_ slot: 0, size: 1
I0605 17:05:07.892113 1570317 backward.cc:363] GradNodeAccumulation ref_cnt is: 0
I0605 17:05:07.892117 1570317 backward.cc:312] Node: BatchNormGradNode addr:0x198b7070, Found pending node: GradNodeAccumulation addr: 0x85cf000
I0605 17:05:07.892119 1570317 backward.cc:339] Get Edge and grad_output_tensor with slot: 4, rank: 0 's name is:
I0605 17:05:07.892122 1570317 grad_tensor_holder.h:32] Init GradTensorHolder with meta size: 1
I0605 17:05:07.892124 1570317 grad_tensor_holder.h:35] Init GradTensorHolder with meta rank: 1
I0605 17:05:07.892127 1570317 backward.cc:348] Construct GradTensorHolder for grad node: GradNodeAccumulation
I0605 17:05:07.892130 1570317 backward.cc:353] Sum or Move grad inputs for edge slot: 0, rank: 0
I0605 17:05:07.892132 1570317 grad_tensor_holder.cc:132] Move Tensor for buffer_ slot: 0, size: 1
I0605 17:05:07.892135 1570317 backward.cc:363] GradNodeAccumulation ref_cnt is: 0
I0605 17:05:07.892140 1570317 auto_growth_best_fit_allocator.cc:131] Free 3211264 bytes, ptr = 0xffff81aca4a30000
I0605 17:05:07.892148 1570317 backward.cc:243] Preparing GradNode:GradNodeAccumulation addr:0x85cf000
I0605 17:05:07.892150 1570317 backward.cc:270] Run Backward Kernel with GradTensorHolder.
I0605 17:05:07.892153 1570317 accumulation_node.cc:103] Running AD API Grad: GradNodeAccumulation
I0605 17:05:07.892158 1570317 accumulation_node.cc:40] Move Tensor ptr: 0x19ad40d0
I0605 17:05:07.892163 1570317 reducer.cc:762] Tensor[146] [batch_norm2d_48.b_0@Grad] arrived and triggered disthook
I0605 17:05:07.892166 1570317 reducer.cc:778] Tensor[146][batch_norm2d_48.b_0] is marked ready.
I0605 17:05:07.892175 1570317 accumulation_node.cc:135] Finish AD API Grad: GradNodeAccumulation
I0605 17:05:07.892191 1570317 accumulation_node.cc:148] { Input: [], Output: [(grad_out, [{Name: None, Initialized: 1, Ptr: 0x19ad40d0 TensorInfo: [ Type: DenseTensor, Dtype: float32, Place: Place(intel_gpu:0), Shape: 512 ], ADInfo:[ Grad: [ {Name: None, Initialized: 0, Ptr: 0 TensorInfo: [ Unknown ], ADInfo:[ None ]} ], GradNode: [ None ], StopGradient: [ 0 ] ]}]), ] }
I0605 17:05:07.892197 1570317 backward.cc:283] retain_graph is false, need to clear the TensorWrapper of nodes.
I0605 17:05:07.892201 1570317 accumulation_node.h:47] Do nothing here now
I0605 17:05:07.892204 1570317 backward.cc:243] Preparing GradNode:GradNodeAccumulation addr:0x85cdae0
I0605 17:05:07.892207 1570317 backward.cc:270] Run Backward Kernel with GradTensorHolder.
I0605 17:05:07.892210 1570317 accumulation_node.cc:103] Running AD API Grad: GradNodeAccumulation
I0605 17:05:07.892215 1570317 accumulation_node.cc:40] Move Tensor ptr: 0x1a580460
I0605 17:05:07.892218 1570317 reducer.cc:762] Tensor[145] [batch_norm2d_48.w_0@Grad] arrived and triggered disthook
I0605 17:05:07.892221 1570317 reducer.cc:778] Tensor[145][batch_norm2d_48.w_0] is marked ready.
I0605 17:05:07.892226 1570317 reducer.cc:906] Group[0] is ready
I0605 17:05:07.892230 1570317 reducer.cc:1045] group [0] start fused_allreduce.
I0605 17:05:07.892242 1570317 api.cc:24921] empty API kernel key: [CPU, Undefined(AnyLayout), float32]
I0605 17:05:07.892256 1570317 api.cc:24928] empty kernel: {"input":[],"output":["CPU, NCHW, float32"],"attribute":["IntArray","DataType"]}
I0605 17:05:07.892292 1570317 dense_tensor.cc:139] Allocate data with bytes: 30261152
I0605 17:05:07.892329 1570317 context_pool.cc:62] DeviceContextPool Get: Place(cpu)
I0605 17:05:07.892359 1570317 memcpy.cc:743] memory::Copy 2048 Bytes from 0xffff8181ff0cc800(Place(cpu)) to 0x1ab0f000(Place(cpu))
--------------------------------------
C++ Traceback (most recent call last):
--------------------------------------
0 egr::Backward(std::vector<paddle::Tensor, std::allocator<paddle::Tensor> > const&, std::vector<paddle::Tensor, std::allocator<paddle::Tensor> > const&, bool)
1 egr::RunBackward(std::vector<paddle::Tensor, std::allocator<paddle::Tensor> > const&, std::vector<paddle::Tensor, std::allocator<paddle::Tensor> > const&, bool, bool, std::vector<paddle::Tensor, std::allocator<paddle::Tensor> > const&, bool, std::vector<paddle::Tensor, std::allocator<paddle::Tensor> > const&)
2 egr::GradNodeAccumulation::operator()(paddle::small_vector<std::vector<paddle::Tensor, std::allocator<paddle::Tensor> >, 15u>&, bool, bool)
3 egr::GradNodeAccumulation::ApplyReduceHooks()
4 paddle::distributed::EagerReducer::MarkVarReady(unsigned long, bool)
5 paddle::distributed::EagerReducer::MarkGroupReady(unsigned long)
6 paddle::distributed::EagerReducer::FusedAllReduceSchedule(paddle::distributed::EagerGroup*, int)
7 paddle::distributed::EagerGroup::ConcatTensors(phi::Place const&)
8 paddle::operators::math::ConcatFunctor<phi::CPUContext, float>::operator()(phi::CPUContext const&, std::vector<phi::DenseTensor, std::allocator<phi::DenseTensor> > const&, int, phi::DenseTensor*)
9 phi::funcs::ConcatFunctor<phi::CPUContext, float>::operator()(phi::CPUContext const&, std::vector<phi::DenseTensor, std::allocator<phi::DenseTensor> > const&, int, phi::DenseTensor*)
10 phi::memory_utils::Copy(phi::Place const&, void*, phi::Place const&, void const*, unsigned long)
11 phi::MemoryUtils::Copy(phi::Place const&, void*, phi::Place const&, void const*, unsigned long)
12 void paddle::memory::Copy<phi::Place, phi::Place>(phi::Place, void*, phi::Place, void const*, unsigned long)
----------------------
Error Message Summary:
----------------------
FatalError: `Segmentation fault` is detected by the operating system.
[TimeInfo: *** Aborted at 1685955907 (unix time) try "date -d @1685955907" if you are using GNU date ***]
[SignalInfo: *** SIGSEGV (@0xffff8181ff0cc800) received by PID 1570317 (TID 0x7fa143155740) from PID 18446744073693612032 ***]
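My reading of the log and trace, for what it's worth: the crash is in the DDP fused allreduce, not in batch_norm_grad itself. Once batch_norm2d_48.w_0@Grad is marked ready, Group[0] starts fused_allreduce, and the fused buffer is created by an empty kernel with key [CPU, Undefined(AnyLayout), float32] (the api.cc:24921 line above), i.e. on the host. EagerGroup::ConcatTensors then takes the CPU path (frames 8-9 are ConcatFunctor<phi::CPUContext, float>) and issues a host-to-host memory::Copy. But the source gradient actually lives on the device: 0xffff8181ff0cc800 was allocated by the intel_gpu allocator during batch_norm_grad (the auto_growth_best_fit_allocator.cc:123 line above), yet memcpy.cc:743 copies from it as Place(cpu). That same device pointer is the faulting address in SignalInfo, so a host-side memcpy dereferencing an intel_gpu pointer looks like the immediate cause; the open question is why the reducer picked a CPU place for this group's concat.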