本文主要记录了 GitHub 上的热门 Repo AlexeyAB/Darknet 的一些使用要点。

与 OpenCV 的速度比较

使用 GPU 时此 Repo 速度比 OpenCV 快
使用 CPU 时此 Repo 速度比 OpenCV 慢
来源

Finetune 相关

记在 base 数据集上训练得到的模型为 yolov3-old.weights。当有新增数据集时：

若新增数据集和 base 数据集类别一致，则在 yolov3-old.weights 的基础上，用 base+新增数据进行训练。

若新增数据集包含其他类别，则先用 darknet.exe partial cfg/yolov3.cfg yolov3.weights yolov3.conv.81 81 得到 yolov3.conv.81 模型，再在此模型上用新数据集进行训练。来源 partial命令

处理逻辑

以使用单个 GPU 进行处理为例:

训练

假设 cfg 文件中定义的 batch=64, subdivisions=16

<1> 解析各种配置文件，如 coco.data, coco.names 等，获取各种参数。
<2> 解析 cfg 文件并将其实例化为 network net 对象. (注意，此过程中 net.batch 参数不是 cfg 文件中的 batch 值，而是 cfg 文件中 batch/subdivisions 得到的值。net.batch 的值为真正进行前向传播时的 batch size)
<3> 加载预训练模型 - weights 文件到 net 对象中。
<4> 获取所有训练图像的路径。
<5> 创建一个线程用来从磁盘中 load 数据，每次从磁盘中 load cfg 文件中的 batch 张图像到内存。
<6> 迭代训练。
<6.1> 将一次从磁盘中读取的 batch (cfg 文件中) 张图像分成 subdivisions 份，即每份为 net.batch。使用 net.batch 张图像进行一次迭代，同时返回一个 batch 的 loss，最后，对 subdivisions 个 batch 的 loss 进行加和平均，得到一次从磁盘读取的所有图像 (cfg 中 batch 张) 的平均 loss.
<6.2> 不断重复步骤 6.1，在某些迭代次数时生成模型以及计算 mAP.

测试

由于 AlexeyAB 没有提供批量测试函数 (Batch Inference), 因此我自己实现了此功能。

// Add the following functions to network.c.

// Moves the host output pointer of every YOLO / REGION / DETECTION layer by
// `images` images (a negative value moves it back). One image is taken to be
// `outputs` floats of the layer, as the original per-image stepping did.
static void shift_detection_outputs(network *net, int images) {
	for (int k = 0; k < net->n; k++) {
		layer *l = &net->layers[k];
		if (l->type == YOLO || l->type == REGION || l->type == DETECTION) {
			l->output += images * l->outputs;
		}
	}
}

// Runs one forward pass over a batch of images and returns the NMS-filtered
// detections of every image in the batch.
//
// Parameters:
//   imgBatch     input buffer passed to network_predict(); it is expected to
//                hold net->batch preprocessed images prepared by the caller.
//                OWNERSHIP: this function free()s imgBatch right after the
//                forward pass, so the caller must not use or free it again.
//   net          network to run; net->batch is the number of images handled.
//   thresh       forwarded to get_network_boxes().
//   hier_thresh  forwarded to get_network_boxes().
//   nms          forwarded to do_nms_sort().
//   meta         meta.classes is the number of class scores checked per box.
//   box_nums     output array with room for net->batch ints; box_nums[i]
//                receives the number of rows stored in results[i].
//
// Returns a calloc'ed array of net->batch pointers. results[i] is a calloc'ed
// array of box_nums[i] * 6 floats, one row {x, y, w, h, prob, class} per
// (box, class) pair whose probability is > 0 after NMS. The caller must
// free() every results[i] and then results. Returns NULL if an allocation
// fails.
float **network_predict_image_batch_gpu(float *imgBatch, network* net, float thresh, float hier_thresh, float nms, metadata meta, int* box_nums) {
	const int w = network_width(net);
	const int h = network_height(net);
	const int batch = net->batch;
	const int letterbox = 0;
	int consumed = 0; // how many images the layer output pointers were advanced by
	int failed = 0;

	// predict batch images
	network_predict(*net, imgBatch);
	free(imgBatch);

	float **results = (float**)calloc(batch, sizeof(float*));
	if (!results) {
		return NULL;
	}

	for (int i = 0; i < batch; i++) {
		int nboxes = 0;
		// Reads the detections at the current position of the layer output
		// pointers, i.e. those of image i.
		detection *dets = get_network_boxes(net, w, h, thresh, hier_thresh, 0, 1, &nboxes, letterbox);
		do_nms_sort(dets, nboxes, meta.classes, nms);

		// First pass: count the (box, class) pairs that survived NMS.
		int real_box_num = 0;
		for (int j = 0; j < nboxes; j++) {
			for (int s = 0; s < meta.classes; s++) {
				if (dets[j].prob[s] > 0) {
					real_box_num += 1;
				}
			}
		}
		box_nums[i] = real_box_num;

		float *res = (float*)calloc((size_t)real_box_num * 6, sizeof(float)); // 6 is {x, y, w, h, prob, class}
		if (!res && real_box_num > 0) {
			free_detections(dets, nboxes);
			failed = 1;
			break;
		}

		// Second pass: write one 6-float row per surviving pair.
		float *out = res;
		for (int j = 0; j < nboxes; j++) {
			for (int s = 0; s < meta.classes; s++) {
				const float now_prob = dets[j].prob[s];
				if (now_prob > 0) {
					const box b = dets[j].bbox;
					*out++ = b.x;
					*out++ = b.y;
					*out++ = b.w;
					*out++ = b.h;
					*out++ = now_prob;
					*out++ = (float)s;
				}
			}
		}
		results[i] = res;

		free_detections(dets, nboxes);

		// Step the detection layers to the next image of the batch.
		shift_detection_outputs(net, 1);
		consumed += 1;
	}

	// Put the layer output pointers back where they were. Without this the
	// network is left pointing past its buffers and any later forward pass or
	// free of the network works on wrong addresses.
	shift_detection_outputs(net, -consumed);

	if (failed) {
		for (int i = 0; i < batch; i++) {
			free(results[i]);
		}
		free(results);
		return NULL;
	}

	return results;
}

// To build darknet as a DLL for use by other programs, add this declaration
// to darknet.h, next to network_predict_image.
LIB_API float **network_predict_image_batch_gpu(float *imgBatch, network* net, float thresh, float hier_thresh, float nms, metadata meta, int* box_nums);

调用此函数的示例代码

int main() {
	char* configPath = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/cfg/yolov3-mlit-SD.cfg";
	char* weightPath = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/backup/yolov3-mlit-SD_50000.weights";
	char* metaPath = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/cfg/mlit.data";
	string result_dir = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/result/";

	char* img_path_1 = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/test_images/test_1.jpg";
	char* img_path_2 = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/test_images/test_2.jpg";
	char* img_path_3 = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/test_images/test_3.jpg";
	char* img_path_4 = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/test_images/test_4.jpg";
	char* img_path_5 = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/test_images/test_5.jpg";
	char* img_path_6 = "C:/Users/taoxuan.G08/Documents/Visual Studio 2015/Projects/mlit_yolo/mlit_yolo/test_images/test_6.jpg";

	int batchSize = 6;

	char **img_paths = (char**)calloc(batchSize, sizeof(char*));
	img_paths[0] = img_path_1;
	img_paths[1] = img_path_2;
	img_paths[2] = img_path_3;
	img_paths[3] = img_path_4;
	img_paths[4] = img_path_5;
	img_paths[5] = img_path_6;

	float conf_thresh = 0.6;
	float hier_thresh = 0.5;
	float nms = 0.45;

	cuda_set_device(0);
	network* netMain = load_network_custom(configPath, weightPath, 0, batchSize);
	metadata metaMain = get_metadata(metaPath);

	int input_w = network_width(netMain);
	int input_h = network_height(netMain);
	int c = 3;
	//int num_calsses = metaMain.classes;

	// 这里得到的 batch 就是上面手动设置的 batchSize
	int batch = netMain->batch;

	// 从 char** 中读取图像数据，并合并成 float *
	float *imgBatch = (float*)calloc(batch * input_w * input_h * c, sizeof(float));

	//No OpenCV
	for (int i = 0; i < batch; i++) {
		image dark_image = load_image_color(img_paths[i], 0, 0);
		image resized = resize_image(dark_image, input_w, input_h);
		memcpy(imgBatch + i*input_w*input_h*c, resized.data, input_w*input_h*c * sizeof(float));
	}


	int *box_num_batch = (int*)calloc(batch, sizeof(int));
	float **results = network_predict_image_batch_gpu(imgBatch, netMain, conf_thresh, hier_thresh, nms, metaMain, box_num_batch);
	float *res;

	// 解析 results
	for (int i = 0; i < batch; i++) {
		cv::Mat image2show = cv::imread(img_paths[i]);
		int ori_w = image2show.cols;
		int ori_h = image2show.rows;

		int nbox = *(box_num_batch + i);
		res = results[i];

		std::cout << std::endl << nbox << " boxes detected" << std::endl;
		std::cout << std::endl;

		for (int j = 0; j < nbox; j++) {
			float x_ctr = res[0 + 6 * j];
			float y_ctr = res[1 + 6 * j];
			float width = res[2 + 6 * j];
			float height = res[3 + 6 * j];
			float prob = res[4 + 6 * j];
			float cls_idx = res[5 + 6 * j];
			char * nameTag = metaMain.names[(int)(cls_idx)];

			int w_on_ori = (int)(width * ori_w);
			int h_on_ori = (int)(height * ori_h);
			int lft = (int)(x_ctr * ori_w - w_on_ori / 2);
			int rgt = (int)(x_ctr * ori_w + w_on_ori / 2);
			int top = (int)(y_ctr * ori_h - h_on_ori / 2);
			int bot = (int)(y_ctr * ori_h + h_on_ori / 2);

			cv::Point pt1(lft, top);
			cv::Point pt2(rgt, bot);
			cv::rectangle(image2show, pt1, pt2, Scalar(0, 255, 0), 1);
			std::string text = std::string(nameTag) + " [" + to_string(int(round(prob * 100))) + "]";
			cv::putText(image2show, text, Point(pt1.x, pt1.y - 5), cv::FONT_HERSHEY_COMPLEX, 0.5, Scalar(0, 255, 0), 2);

			printf(">>> %d %d %d %d %f <<< ", (int)(x_ctr * ori_w), (int)(y_ctr * ori_h), (int)(width * ori_w), (int)(height * ori_h), prob);
			std::cout << nameTag << std::endl;

		}

		cv::imshow("detected", image2show);
		cv::waitKey(0);
		//free(res);
		//free_detections(res, nbox);
	}

	free(box_num_batch);
	free(results);

	system("pause");

	return 0;
}

主存与显存

在此 repo 的实现中，数据是先从磁盘读取到主存中，然后在使用 GPU 进行训练前，将主存中的数据拷贝至显存对象中，然后使用 GPU 进行运算。
参考 network_kernels.cu 中的 float *network_predict_gpu(network net, float *input) 函数。

在 Traffic counter 项目中，将来可能使用 GPU 版本的解码器，此解码器解码后的图像数据是在显存中的。因此，设想在将来的处理中，将略过从主存往显存拷贝数据这一步骤，直接传递显存中的对象，并进行处理。具体实现时，可重点参考 float *network_predict_gpu(network net, float *input) 函数。

与 OpenCV 的速度比较

Finetune 相关

处理逻辑

训练

测试

主存与显存

Confidence threshold

NMS threshold