labelimg yolo 数据集生成及划分

# 图像与标注文件分离

本步骤是针对使用 labelimg 时，默认图像文件和标注文件存放到同一目录，因此需要分开处理，最终结果为图像在 images文件夹下，标注文件在 labels文件夹下。如果指定的是不同目录，可以省略此步骤

separate.py

# 数据集图像与标注文件分离脚本

import os
import shutil

import argparse

# 创建 ArgumentParser 对象
parser = argparse.ArgumentParser(description='数据集划分工具')

# 添加参数
parser.add_argument('--dir', help='目录位置', required=True)

# 解析命令行参数
args = parser.parse_args()

# 当前文件夹路径
folder_path = args.dir

# print(folder_path)

# 删除已存在的 images 文件夹和 labels 文件夹及其内容
shutil.rmtree(os.path.join(folder_path, 'images'), ignore_errors=True)
shutil.rmtree(os.path.join(folder_path, 'labels'), ignore_errors=True)

# 创建 images 文件夹和 labels 文件夹
os.makedirs(os.path.join(folder_path, 'images'))
os.makedirs(os.path.join(folder_path, 'labels'))

# 递归查找当前文件夹下的所有文件
for root, dirs, files in os.walk(folder_path):
        # 排除 label 和 images 文件夹
    if 'labels' in dirs:
        dirs.remove('labels')
    if 'images' in dirs:
        dirs.remove('images')
    print(root, dirs, files)
    for file in files:
        file_path = os.path.join(root, file)
        if file.endswith('.jpg'):
            # 复制 jpg 文件到 images 文件夹下（copy2 同名强制覆盖）
            shutil.copy(file_path, os.path.join(folder_path, 'images', file))
        elif file.endswith('.txt'):
            # 复制 txt 文件到 labels 文件夹下（copy2 同名强制覆盖）
            shutil.copy(file_path, os.path.join(folder_path, 'labels', file))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45

运行脚本

python separate.py --dir /home/hualiujie/baoxinshagnchuan/ultralytics-main-cgh/datasets/TVDSDataSet

# 划分数据集

将数据集以 7:2:1的比例划分为训练集，验证集、测试集

divide_dataset.py

# 划分数据集（7:2:1）

import os
import numpy as np
import math
import shutil
import tkinter as tk
from tkinter import filedialog

# 使用时请选择数据集根目录，根目录必须包含images与labels两个子目录
# images（原图）与labels（标签）对应
# train：训练集（训练模型）
# val：验证集（选择模型）
# test：测试集（评估模型）

# 遍历images、labels目录中的文件树
# 得到单个文件对象
# 利用随机数将文件名打乱放入数组对象中
# 对数组进行切割，分成三份元组，按照比例 train(7),val(2),test(1)
# 遍历单个元组，按照元组中的文本查找文件并分类

# walk 函数，一个目录，返回三个元组，第一个是目录路径，第二个参数是子目录树，第三个参数是该目录下的文件树


# 数据提取函数，将文件夹中的名称提取出来
# 解释下面代码含义
def data_extraction(paths):
    file_name_list = np.array([])
    for dirs, subdir, fileList in os.walk(paths):
        for file in fileList:
            # 文件名与文件后辍分割
            file_name = os.path.splitext(file)[0]
            # 将文件名放入数组中
            if file_name != 'classes':
                file_name_list = np.append(file_name_list, file_name)
    return file_name_list


# 分配资源（随机）
def allocation(file_list: np.array([]), percentage: list):
    # 随机为 train 分配资源
    train_num = math.floor(np.size(file_list) * percentage[0])
    train_list = np.random.choice(file_list, train_num, replace=False)

    # 得到剩余资源
    leftover_list = np.setdiff1d(file_list, train_list)

    # 随机为 val 分配资源
    val_num = math.floor(np.size(file_list) * percentage[1])
    val_list = np.random.choice(leftover_list, val_num, replace=False)

    # test 拾取剩下资源
    test_list = np.setdiff1d(leftover_list, val_list)
    return np.array([train_list, val_list, test_list], dtype=object)


# 检测文件夹是否存在
def dir_make(path):
    if not os.path.exists(path):
        os.makedirs(path)


# 根据文件名移动文件
def file_move(path, path2, __name__):
    # print(path,path2,__name__)
    # 遍历老目录
    for root, dirs, files in os.walk(path):
        # 查找文件
        for filename in files:
            # print(filename,name)
            # print(name in filename)
            if name in filename:
                f1 = os.path.join(path, filename)
                f2 = os.path.join(path2, filename)
                shutil.copy(f1, f2)
                print(f1 + ' 已经被复制划分到 ' + f2)
                break
        break


# 主函数
if __name__ == '__main__':

    import argparse
    # 创建 ArgumentParser 对象
    parser = argparse.ArgumentParser(description='数据集划分脚本工具')
    # 添加参数
    parser.add_argument('--dir', help='目录位置', required=True)

    # 解析命令行参数
    args = parser.parse_args()

    PATH = args.dir

    # 图片路径
    ImgPath = os.path.join(PATH, 'images')
    # 标注路径
    LabelPath = os.path.join(PATH, 'labels')
    # 训练占比，分别对应 train、val，分配前两个参数时，请不要超过0.9，因为test会自动得到剩下的图片
    PERCENTAGE = [0.7, 0.2]
    print(ImgPath,LabelPath)
    # 检测 train 文件夹
    ImgTrainPath = os.path.join(ImgPath, 'train')
    dir_make(ImgTrainPath)
    LabTrainPath = os.path.join(LabelPath, 'train')
    dir_make(LabTrainPath)

    # 检测 val 文件夹
    ImgValPath = os.path.join(ImgPath, 'val')
    dir_make(ImgValPath)
    LabValPath = os.path.join(LabelPath, 'val')
    dir_make(LabValPath)

    # 检测 test 文件夹
    ImgTestPath = os.path.join(ImgPath, 'test')
    dir_make(ImgTestPath)
    LabTestPath = os.path.join(LabelPath, 'test')
    dir_make(LabTestPath)

    # 获得资源分配
    res = allocation(data_extraction(ImgPath), PERCENTAGE)
    # 遍历三个数组
    for res_item in res:
        # 遍历当前数组
        for name in res_item:
            # train 分类
            if name in res[0]:
                file_move(ImgPath, ImgTrainPath, name)
                file_move(LabelPath, LabTrainPath, name)
            # val 分类
            elif name in res[1]:
                file_move(ImgPath, ImgValPath, name)
                file_move(LabelPath, LabValPath, name)
            # test 分类
            else:
                file_move(ImgPath, ImgTestPath, name)
                file_move(LabelPath, LabTestPath, name)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137

运行脚本

python divide_dataset.py --dir /home/hualiujie/baoxinshagnchuan/ultralytics-main-cgh/datasets/TVDSDataSet

编辑

上次更新: 2023/10/14, 10:01:13

← YOLOv8 断点续训『LabelImg』使用小技巧→