Pose Estimation / Object Detection — CenterNet (PyTorch)

CenterNet

This architecture predicts the centers of objects as a heatmap and treats the size of the predicted box as a regression task.

It is also used for pose estimation:

The coordinates of human joints can likewise be predicted with regression. I use the same idea to predict each vehicle's x, y, z coordinates together with its yaw, pitch_cos, pitch_sin and roll angles. For pitch I predict sin and cos because, as we will see, this angle can be close to both 0 and 3.14 (π). These 7 values are my regression targets, instead of shift_x, shift_y, size_x, size_y.
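As a quick illustration of why (sin, cos) is the friendlier target near the wrap-around point, here is a minimal sketch (my addition, not part of the pipeline below) that recovers an angle from a predicted sine/cosine pair; np.arctan2 handles the normalization and the quadrant automatically:

import numpy as np

def angle_from_sin_cos(s, c):
    # Recover an angle in (-pi, pi] from (possibly unnormalized) sin/cos predictions
    return np.arctan2(s, c)

# Two pitches that are close on the circle but far apart as raw numbers:
a, b = np.pi - 0.05, -(np.pi - 0.05)
print(np.sin(a), np.cos(a))  # ~( 0.05, -0.999)
print(np.sin(b), np.cos(b))  # ~(-0.05, -0.999)
# A raw-angle regression sees a distance of ~2*pi between a and b,
# while their (sin, cos) targets differ only slightly.
print(angle_from_sin_cos(np.sin(a), np.cos(a)))  # ~3.09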

Loading the data

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
import os
import gc  # used by gc.collect() in the prediction section below
from scipy.optimize import minimize
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from torchvision import transforms, utils

PATH = '../input/pku-autonomous-driving/'
os.listdir(PATH)

train = pd.read_csv(PATH + 'train.csv')
test = pd.read_csv(PATH + 'sample_submission.csv')

# From camera.zip
camera_matrix = np.array([[2304.5479, 0, 1686.2379],
                          [0, 2305.8757, 1354.9849],
                          [0, 0, 1]], dtype=np.float32)
camera_matrix_inv = np.linalg.inv(camera_matrix)

train.head()

# ImageId PredictionString
# 0 ID_8a6e65317 16 0.254839 -2.57534 -3.10256 7.96539 3.20066 ...
# 1 ID_337ddc495 66 0.163988 0.192169 -3.12112 -3.17424 6.55331...
# 2 ID_a381bf4d0 43 0.162877 0.00519276 -3.02676 2.1876 3.53427...
# 3 ID_7c4a3e0aa 43 0.126957 -3.04442 -3.10883 -14.738 24.6389 ...
# 4 ID_8b510fad6 37 0.16017 0.00862796 -3.0887 -3.04548 3.4977 ...

The ImageId column contains the image names:

def imread(path, fast_mode=False):
    img = cv2.imread(path)
    if not fast_mode and img is not None and len(img.shape) == 3:
        img = np.array(img[:, :, ::-1])  # BGR -> RGB
    return img

img = imread(PATH + 'train_images/ID_8a6e65317' + '.jpg')
IMG_SHAPE = img.shape

plt.figure(figsize=(15,8))
plt.imshow(img)

The PredictionString column contains the pose information for all the cars in an image. From the data description: the primary data is images of cars and the related pose information. The pose information is formatted as a string, like so: model type, yaw, pitch, roll, x, y, z. A concrete example with two cars in the photo: 5 0.5 0.5 0.5 0.0 0.0 0.0 32 0.25 0.25 0.25 0.5 0.4 0.7. We need a function to extract these values:

def str2coords(s, names=['id', 'yaw', 'pitch', 'roll', 'x', 'y', 'z']):
    '''
    Input:
        s: PredictionString (e.g. from train dataframe)
        names: array of what to extract from the string
    Output:
        list of dicts with keys from `names`
    '''
    coords = []
    for l in np.array(s.split()).reshape([-1, 7]):
        coords.append(dict(zip(names, l.astype('float'))))
        if 'id' in coords[-1]:
            coords[-1]['id'] = int(coords[-1]['id'])
    return coords

inp = train['PredictionString'][0]
print('Output:\n', str2coords(inp))

# Output:
# [{'id': 16, 'yaw': 0.254839, 'pitch': -2.57534, 'roll': -3.10256, 'x': 7.96539, 'y': 3.20066, 'z': 11.0225}, {'id': 56, 'yaw': 0.181647, 'pitch': -1.46947, 'roll': -3.12159, 'x': 9.60332, 'y': 4.66632, 'z': 19.339}, {'id': 70, 'yaw': 0.163072, 'pitch': -1.56865, 'roll': -3.11754, 'x': 10.39, 'y': 11.2219, 'z': 59.7825}, {'id': 70, 'yaw': 0.141942, 'pitch': -3.1395, 'roll': 3.11969, 'x': -9.59236, 'y': 5.13662, 'z': 24.7337}, {'id': 46, 'yaw': 0.163068, 'pitch': -2.08578, 'roll': -3.11754, 'x': 9.83335, 'y': 13.2689, 'z': 72.9323}]

Data distribution

lens = [len(str2coords(s)) for s in train['PredictionString']]

plt.figure(figsize=(15,6))
sns.countplot(lens);
plt.xlabel('Number of cars in image')

A dataframe with all the points:

points_df = pd.DataFrame()
for col in ['x', 'y', 'z', 'yaw', 'pitch', 'roll']:
    arr = []
    for ps in train['PredictionString']:
        coords = str2coords(ps)
        arr += [c[col] for c in coords]
    points_df[col] = arr

print('len(points_df)', len(points_df))
points_df.head()

# x y z yaw pitch roll
# 0 7.96539 3.20066 11.0225 0.254839 -2.57534 -3.10256
# 1 9.60332 4.66632 19.3390 0.181647 -1.46947 -3.12159
# 2 10.39000 11.22190 59.7825 0.163072 -1.56865 -3.11754
# 3 -9.59236 5.13662 24.7337 0.141942 -3.13950 3.11969
# 4 9.83335 13.26890 72.9323 0.163068 -2.08578 -3.11754

plt.figure(figsize=(15,6))
sns.distplot(points_df['x'], bins=500);
plt.xlabel('x')
plt.show()

plt.figure(figsize=(15,6))
sns.distplot(points_df['y'], bins=500);
plt.xlabel('y')
plt.show()

plt.figure(figsize=(15,6))
sns.distplot(points_df['z'], bins=500);
plt.xlabel('z')
plt.show()

plt.figure(figsize=(15,6))
sns.distplot(points_df['yaw'], bins=500);
plt.xlabel('yaw')
plt.show()

plt.figure(figsize=(15,6))
sns.distplot(points_df['pitch'], bins=500);
plt.xlabel('pitch')
plt.show()

My guess is that pitch and yaw are mixed up in this dataset. Pitch cannot be that large: it would mean the cars are upside down. Roll has the same wrap-around problem (its values cluster near ±π), so let's rotate it by π:

def rotate(x, angle):
    x = x + angle
    x = x - (x + np.pi) // (2 * np.pi) * 2 * np.pi  # wrap into [-pi, pi)
    return x

plt.figure(figsize=(15,6))
sns.distplot(points_df['roll'].map(lambda x: rotate(x, np.pi)), bins=500);
plt.xlabel('roll rotated by pi')
plt.show()
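A quick sanity check (my addition) that rotate wraps any angle into [-π, π); the floor division does the wrapping:

for a in [0.0, 3.0, 3.5, -3.5, 2 * np.pi]:
    print(a, '->', rotate(a, 0))
# 0.0 -> 0.0, 3.0 -> 3.0, 3.5 -> ~-2.783, -3.5 -> ~2.783, 6.283 -> ~0.0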

2D visualization

def get_img_coords(s):
    '''
    Input is a PredictionString (e.g. from train dataframe)
    Output is two arrays:
        xs: x coordinates in the image (column)
        ys: y coordinates in the image (row)
    '''
    coords = str2coords(s)
    xs = [c['x'] for c in coords]
    ys = [c['y'] for c in coords]
    zs = [c['z'] for c in coords]
    P = np.array(list(zip(xs, ys, zs))).T
    img_p = np.dot(camera_matrix, P).T
    img_p[:, 0] /= img_p[:, 2]
    img_p[:, 1] /= img_p[:, 2]
    img_xs = img_p[:, 0]
    img_ys = img_p[:, 1]
    img_zs = img_p[:, 2]  # z = distance from the camera
    return img_xs, img_ys

plt.figure(figsize=(14,14))
plt.imshow(imread(PATH + 'train_images/' + train['ImageId'][2217] + '.jpg'))
plt.scatter(*get_img_coords(train['PredictionString'][2217]), color='red', s=100);
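get_img_coords is just the pinhole projection u = fx·x/z + cx, v = fy·y/z + cy applied to every car. As a hand check (my addition), the first car of the first training row (x, y, z ≈ 7.97, 3.20, 11.02) should land inside the roughly 3384×2710-pixel frame:

c = str2coords(train['PredictionString'][0])[0]
u = c['x'] * camera_matrix[0, 0] / c['z'] + camera_matrix[0, 2]
v = c['y'] * camera_matrix[1, 1] / c['z'] + camera_matrix[1, 2]
print(u, v)  # ~ (3352, 2025), near the bottom-right of the image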

Let's look at the distribution of all the points. The image here is just for reference:

xs, ys = [], []

for ps in train['PredictionString']:
    x, y = get_img_coords(ps)
    xs += list(x)
    ys += list(y)

plt.figure(figsize=(18,18))
plt.imshow(imread(PATH + 'train_images/' + train['ImageId'][2217] + '.jpg'), alpha=0.3)
plt.scatter(xs, ys, color='red', s=10, alpha=0.2);

A lot of points fall outside the image! Let's look at this distribution "from above":

# Road points
road_width = 3
road_xs = [-road_width, road_width, road_width, -road_width, -road_width]
road_ys = [0, 0, 500, 500, 0]

plt.figure(figsize=(16,16))
plt.axes().set_aspect(1)
plt.xlim(-50,50)
plt.ylim(0,100)

# View road
plt.fill(road_xs, road_ys, alpha=0.2, color='gray')
plt.plot([road_width/2,road_width/2], [0,100], alpha=0.4, linewidth=4, color='white', ls='--')
plt.plot([-road_width/2,-road_width/2], [0,100], alpha=0.4, linewidth=4, color='white', ls='--')
# View cars
plt.scatter(points_df['x'], np.sqrt(points_df['z']**2 + points_df['y']**2), color='red', s=10, alpha=0.1);

3D distribution of the points:

fig = px.scatter_3d(points_df, x='x', y='y', z='z',color='pitch', range_x=(-50,50), range_y=(0,50), range_z=(0,250), opacity=0.1)
fig.show()

x is measured from left to right; judging by the projection code above, y points downward and z is the distance from the camera. Let's see how well the points lie in a plane by trying to predict y given the known x and z:

zy_slope = LinearRegression()
X = points_df[['z']]
y = points_df['y']
zy_slope.fit(X, y)
print('MAE without x:', mean_absolute_error(y, zy_slope.predict(X)))

# Will use this model later
xzy_slope = LinearRegression()
X = points_df[['x', 'z']]
y = points_df['y']
xzy_slope.fit(X, y)
print('MAE with x:', mean_absolute_error(y, xzy_slope.predict(X)))
print('\ndy/dx = {:.3f}\ndy/dz = {:.3f}'.format(*xzy_slope.coef_))

# MAE without x: 0.6553782352986744
# MAE with x: 0.4481661289004074
# dy/dx = 0.051
# dy/dz = 0.169

plt.figure(figsize=(16,16))
plt.xlim(0,500)
plt.ylim(0,100)
plt.scatter(points_df['z'], points_df['y'], label='Real points')
X_line = np.linspace(0,500, 10)
plt.plot(X_line, zy_slope.predict(X_line.reshape(-1, 1)), color='orange', label='Regression')
plt.legend()
plt.xlabel('z coordinate')
plt.ylabel('y coordinate');
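This xzy_slope model is reused later in optimize_xy as a ground-plane prior. A quick usage sketch (the point x = 5 m, z = 50 m is illustrative, not from the data):

y_expected = xzy_slope.predict([[5.0, 50.0]])[0]
print(y_expected)  # roughly dy/dx * 5 + dy/dz * 50 plus the fitted intercept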

3D visualization

from math import sin, cos

# convert euler angles to a rotation matrix
def euler_to_Rot(yaw, pitch, roll):
    Y = np.array([[cos(yaw), 0, sin(yaw)],
                  [0, 1, 0],
                  [-sin(yaw), 0, cos(yaw)]])
    P = np.array([[1, 0, 0],
                  [0, cos(pitch), -sin(pitch)],
                  [0, sin(pitch), cos(pitch)]])
    R = np.array([[cos(roll), -sin(roll), 0],
                  [sin(roll), cos(roll), 0],
                  [0, 0, 1]])
    return np.dot(Y, np.dot(P, R))

def visualize(img, coords):
    # You will also need functions from the previous cells
    # Approximate car half-dimensions
    x_l = 1.02
    y_l = 0.80
    z_l = 2.31

    img = img.copy()
    for point in coords:
        # Get values (note the deliberate pitch/yaw swap and sign flips,
        # in line with the suspicion above that the two angles are mixed up)
        x, y, z = point['x'], point['y'], point['z']
        yaw, pitch, roll = -point['pitch'], -point['yaw'], -point['roll']
        # Math: build the [R|t] matrix and project the box corners
        Rt = np.eye(4)
        t = np.array([x, y, z])
        Rt[:3, 3] = t
        Rt[:3, :3] = euler_to_Rot(yaw, pitch, roll).T
        Rt = Rt[:3, :]
        P = np.array([[x_l, -y_l, -z_l, 1],
                      [x_l, -y_l, z_l, 1],
                      [-x_l, -y_l, z_l, 1],
                      [-x_l, -y_l, -z_l, 1],
                      [0, 0, 0, 1]]).T
        img_cor_points = np.dot(camera_matrix, np.dot(Rt, P))
        img_cor_points = img_cor_points.T
        img_cor_points[:, 0] /= img_cor_points[:, 2]
        img_cor_points[:, 1] /= img_cor_points[:, 2]
        img_cor_points = img_cor_points.astype(int)
        # Drawing
        img = draw_line(img, img_cor_points)
        img = draw_points(img, img_cor_points[-1:])

    return img

def draw_line(image, points):
    color = (255, 0, 0)
    cv2.line(image, tuple(points[0][:2]), tuple(points[3][:2]), color, 16)
    cv2.line(image, tuple(points[0][:2]), tuple(points[1][:2]), color, 16)
    cv2.line(image, tuple(points[1][:2]), tuple(points[2][:2]), color, 16)
    cv2.line(image, tuple(points[2][:2]), tuple(points[3][:2]), color, 16)
    return image


def draw_points(image, points):
    for (p_x, p_y, p_z) in points:
        cv2.circle(image, (p_x, p_y), int(1000 / p_z), (0, 255, 0), -1)

    return image

n_rows = 6

for idx in range(n_rows):
    fig, axes = plt.subplots(1, 2, figsize=(20,20))
    img = imread(PATH + 'train_images/' + train['ImageId'].iloc[idx] + '.jpg')
    axes[0].imshow(img)
    img_vis = visualize(img, str2coords(train['PredictionString'].iloc[idx]))
    axes[1].imshow(img_vis)
    plt.show()

Image preprocessing

IMG_WIDTH = 1024
IMG_HEIGHT = IMG_WIDTH // 16 * 5  # 320
MODEL_SCALE = 8

def _regr_preprocess(regr_dict, flip=False):
    if flip:
        for k in ['x', 'pitch', 'roll']:
            regr_dict[k] = -regr_dict[k]
    for name in ['x', 'y', 'z']:
        regr_dict[name] = regr_dict[name] / 100
    regr_dict['roll'] = rotate(regr_dict['roll'], np.pi)
    regr_dict['pitch_sin'] = sin(regr_dict['pitch'])
    regr_dict['pitch_cos'] = cos(regr_dict['pitch'])
    regr_dict.pop('pitch')
    regr_dict.pop('id')
    return regr_dict

def _regr_back(regr_dict):
    for name in ['x', 'y', 'z']:
        regr_dict[name] = regr_dict[name] * 100
    regr_dict['roll'] = rotate(regr_dict['roll'], -np.pi)

    pitch_sin = regr_dict['pitch_sin'] / np.sqrt(regr_dict['pitch_sin']**2 + regr_dict['pitch_cos']**2)
    pitch_cos = regr_dict['pitch_cos'] / np.sqrt(regr_dict['pitch_sin']**2 + regr_dict['pitch_cos']**2)
    regr_dict['pitch'] = np.arccos(pitch_cos) * np.sign(pitch_sin)
    return regr_dict

def preprocess_image(img, flip=False):
    img = img[img.shape[0] // 2:]  # drop the top half (mostly sky)
    bg = np.ones_like(img) * img.mean(1, keepdims=True).astype(img.dtype)
    bg = bg[:, :img.shape[1] // 6]
    img = np.concatenate([bg, img, bg], 1)  # pad the sides so off-screen cars fit
    img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
    if flip:
        img = img[:,::-1]
    return (img / 255).astype('float32')

def get_mask_and_regr(img, labels, flip=False):
    mask = np.zeros([IMG_HEIGHT // MODEL_SCALE, IMG_WIDTH // MODEL_SCALE], dtype='float32')
    regr_names = ['x', 'y', 'z', 'yaw', 'pitch', 'roll']
    regr = np.zeros([IMG_HEIGHT // MODEL_SCALE, IMG_WIDTH // MODEL_SCALE, 7], dtype='float32')
    coords = str2coords(labels)
    xs, ys = get_img_coords(labels)
    for x, y, regr_dict in zip(xs, ys, coords):
        x, y = y, x  # switch to (row, column) order
        x = (x - img.shape[0] // 2) * IMG_HEIGHT / (img.shape[0] // 2) / MODEL_SCALE
        x = np.round(x).astype('int')
        y = (y + img.shape[1] // 6) * IMG_WIDTH / (img.shape[1] * 4/3) / MODEL_SCALE
        y = np.round(y).astype('int')
        if x >= 0 and x < IMG_HEIGHT // MODEL_SCALE and y >= 0 and y < IMG_WIDTH // MODEL_SCALE:
            mask[x, y] = 1
            regr_dict = _regr_preprocess(regr_dict, flip)
            regr[x, y] = [regr_dict[n] for n in sorted(regr_dict)]
    if flip:
        mask = np.array(mask[:,::-1])
        regr = np.array(regr[:,::-1])
    return mask, regr

img0 = imread(PATH + 'train_images/' + train['ImageId'][0] + '.jpg')
img = preprocess_image(img0)

mask, regr = get_mask_and_regr(img0, train['PredictionString'][0])

print('img.shape', img.shape, 'std:', np.std(img))
print('mask.shape', mask.shape, 'std:', np.std(mask))
print('regr.shape', regr.shape, 'std:', np.std(regr))

plt.figure(figsize=(16,16))
plt.title('Processed image')
plt.imshow(img)
plt.show()

plt.figure(figsize=(16,16))
plt.title('Detection Mask')
plt.imshow(mask)
plt.show()

plt.figure(figsize=(16,16))
plt.title('Yaw values')
plt.imshow(regr[:,:,-2])
plt.show()

# img.shape (320, 1024, 3) std: 0.29819912
# mask.shape (40, 128) std: 0.031234737
# regr.shape (40, 128, 7) std: 0.013211883
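The row/column arithmetic inside get_mask_and_regr can be hard to read, so here is a restatement (my addition, same formulas) as a standalone helper: keep the bottom half of the image, account for the 1/6-width side padding, resize to IMG_HEIGHT × IMG_WIDTH, then divide by MODEL_SCALE:

def image_xy_to_cell(u, v, img_shape=IMG_SHAPE):
    # u = column (image x), v = row (image y) in the original full-size image
    r = (v - img_shape[0] // 2) * IMG_HEIGHT / (img_shape[0] // 2) / MODEL_SCALE
    c = (u + img_shape[1] // 6) * IMG_WIDTH / (img_shape[1] * 4 / 3) / MODEL_SCALE
    return int(np.round(r)), int(np.round(c))

# e.g. the projected point from the 2D-visualization section:
print(image_xy_to_cell(3352, 2025))  # ~ (20, 111), a valid cell in the 40 x 128 mask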

Define the functions that convert the 2D maps back into 3D coordinates and angles:

DISTANCE_THRESH_CLEAR = 2

def convert_3d_to_2d(x, y, z, fx=2304.5479, fy=2305.8757, cx=1686.2379, cy=1354.9849):
    return x * fx / z + cx, y * fy / z + cy

def optimize_xy(r, c, x0, y0, z0, flipped=False):
    def distance_fn(xyz):
        x, y, z = xyz
        xx = -x if flipped else x
        slope_err = (xzy_slope.predict([[xx, z]])[0] - y)**2
        x, y = convert_3d_to_2d(x, y, z)
        y, x = x, y
        x = (x - IMG_SHAPE[0] // 2) * IMG_HEIGHT / (IMG_SHAPE[0] // 2) / MODEL_SCALE
        y = (y + IMG_SHAPE[1] // 6) * IMG_WIDTH / (IMG_SHAPE[1] * 4 / 3) / MODEL_SCALE
        return max(0.2, (x - r)**2 + (y - c)**2) + max(0.4, slope_err)

    res = minimize(distance_fn, [x0, y0, z0], method='Powell')
    x_new, y_new, z_new = res.x
    return x_new, y_new, z_new

def clear_duplicates(coords):
    for c1 in coords:
        xyz1 = np.array([c1['x'], c1['y'], c1['z']])
        for c2 in coords:
            xyz2 = np.array([c2['x'], c2['y'], c2['z']])
            distance = np.sqrt(((xyz1 - xyz2)**2).sum())
            if distance < DISTANCE_THRESH_CLEAR:
                if c1['confidence'] < c2['confidence']:
                    c1['confidence'] = -1
    return [c for c in coords if c['confidence'] > 0]

def extract_coords(prediction, flipped=False):
    logits = prediction[0]
    regr_output = prediction[1:]
    points = np.argwhere(logits > 0)
    col_names = sorted(['x', 'y', 'z', 'yaw', 'pitch_sin', 'pitch_cos', 'roll'])
    coords = []
    for r, c in points:
        regr_dict = dict(zip(col_names, regr_output[:, r, c]))
        coords.append(_regr_back(regr_dict))
        coords[-1]['confidence'] = 1 / (1 + np.exp(-logits[r, c]))  # sigmoid
        coords[-1]['x'], coords[-1]['y'], coords[-1]['z'] = \
            optimize_xy(r, c,
                        coords[-1]['x'],
                        coords[-1]['y'],
                        coords[-1]['z'], flipped)
    coords = clear_duplicates(coords)
    return coords

def coords2str(coords, names=['yaw', 'pitch', 'roll', 'x', 'y', 'z', 'confidence']):
    s = []
    for c in coords:
        for n in names:
            s.append(str(c.get(n, 0)))
    return ' '.join(s)

Make sure all the forward and backward conversions work consistently:

for idx in range(2):
    fig, axes = plt.subplots(1, 2, figsize=(20,20))

    for ax_i in range(2):
        img0 = imread(PATH + 'train_images/' + train['ImageId'].iloc[idx] + '.jpg')
        if ax_i == 1:
            img0 = img0[:,::-1]
        img = preprocess_image(img0, ax_i==1)
        mask, regr = get_mask_and_regr(img0, train['PredictionString'][idx], ax_i==1)
        regr = np.rollaxis(regr, 2, 0)
        coords = extract_coords(np.concatenate([mask[None], regr], 0), ax_i==1)

        axes[ax_i].set_title('Flip = {}'.format(ax_i==1))
        axes[ax_i].imshow(visualize(img0, coords))
    plt.show()

PyTorch Dataset

class CarDataset(Dataset):
    """Car dataset."""

    def __init__(self, dataframe, root_dir, training=True, transform=None):
        self.df = dataframe
        self.root_dir = root_dir
        self.transform = transform
        self.training = training

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get image name
        idx, labels = self.df.values[idx]
        img_name = self.root_dir.format(idx)

        # Augmentation: horizontal flip with probability 1/10
        flip = False
        if self.training:
            flip = np.random.randint(10) == 1

        # Read image
        img0 = imread(img_name, True)
        img = preprocess_image(img0, flip=flip)
        img = np.rollaxis(img, 2, 0)  # HWC -> CHW

        # Get mask and regression maps
        mask, regr = get_mask_and_regr(img0, labels, flip=flip)
        regr = np.rollaxis(regr, 2, 0)

        return [img, mask, regr]

train_images_dir = PATH + 'train_images/{}.jpg'
test_images_dir = PATH + 'test_images/{}.jpg'

df_train, df_dev = train_test_split(train, test_size=0.01, random_state=42)
df_test = test

# Create dataset objects
train_dataset = CarDataset(df_train, train_images_dir, training=True)
dev_dataset = CarDataset(df_dev, train_images_dir, training=False)
test_dataset = CarDataset(df_test, test_images_dir, training=False)

img, mask, regr = train_dataset[0]

plt.figure(figsize=(16,16))
plt.imshow(np.rollaxis(img, 0, 3))
plt.show()

plt.figure(figsize=(16,16))
plt.imshow(mask)
plt.show()

plt.figure(figsize=(16,16))
plt.imshow(regr[-2])
plt.show()
BATCH_SIZE = 4

# Create data generators - they will produce batches
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
dev_loader = DataLoader(dataset=dev_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
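A quick shape check on one batch (my addition): with IMG_WIDTH = 1024, IMG_HEIGHT = 320 and MODEL_SCALE = 8, the default collate stacks the numpy arrays into tensors of these sizes:

img_batch, mask_batch, regr_batch = next(iter(train_loader))
print(img_batch.shape)   # torch.Size([4, 3, 320, 1024])
print(mask_batch.shape)  # torch.Size([4, 40, 128])
print(regr_batch.shape)  # torch.Size([4, 7, 40, 128])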

PyTorch Model

from efficientnet_pytorch import EfficientNet

class double_conv(nn.Module):
    '''(conv => BN => ReLU) * 2'''
    def __init__(self, in_ch, out_ch):
        super(double_conv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        x = self.conv(x)
        return x

class up(nn.Module):
    def __init__(self, in_ch, out_ch, bilinear=True):
        super(up, self).__init__()

        # would be a nice idea if the upsampling could be learned too,
        # but my machine does not have enough memory to handle all those weights
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        else:
            self.up = nn.ConvTranspose2d(in_ch//2, in_ch//2, 2, stride=2)

        self.conv = double_conv(in_ch, out_ch)

    def forward(self, x1, x2=None):
        x1 = self.up(x1)

        # input is CHW
        diffY = x2.size()[2] - x1.size()[2]
        diffX = x2.size()[3] - x1.size()[3]

        x1 = F.pad(x1, (diffX // 2, diffX - diffX//2,
                        diffY // 2, diffY - diffY//2))

        # for padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd

        if x2 is not None:
            x = torch.cat([x2, x1], dim=1)
        else:
            x = x1
        x = self.conv(x)
        return x

def get_mesh(batch_size, shape_x, shape_y):
    mg_x, mg_y = np.meshgrid(np.linspace(0, 1, shape_y), np.linspace(0, 1, shape_x))
    mg_x = np.tile(mg_x[None, None, :, :], [batch_size, 1, 1, 1]).astype('float32')
    mg_y = np.tile(mg_y[None, None, :, :], [batch_size, 1, 1, 1]).astype('float32')
    mesh = torch.cat([torch.tensor(mg_x).to(device), torch.tensor(mg_y).to(device)], 1)
    return mesh

class MyUNet(nn.Module):
    '''Mixture of previous classes'''
    def __init__(self, n_classes):
        super(MyUNet, self).__init__()
        self.base_model = EfficientNet.from_pretrained('efficientnet-b0')

        self.conv0 = double_conv(5, 64)
        self.conv1 = double_conv(64, 128)
        self.conv2 = double_conv(128, 512)
        self.conv3 = double_conv(512, 1024)

        self.mp = nn.MaxPool2d(2)

        # 1282 = 1280 EfficientNet-b0 feature channels + 2 mesh channels
        self.up1 = up(1282 + 1024, 512)
        self.up2 = up(512 + 512, 256)
        self.outc = nn.Conv2d(256, n_classes, 1)

    def forward(self, x):
        batch_size = x.shape[0]
        mesh1 = get_mesh(batch_size, x.shape[2], x.shape[3])
        x0 = torch.cat([x, mesh1], 1)
        x1 = self.mp(self.conv0(x0))
        x2 = self.mp(self.conv1(x1))
        x3 = self.mp(self.conv2(x2))
        x4 = self.mp(self.conv3(x3))

        # Run the backbone only on the central crop (the padded sides are background)
        x_center = x[:, :, :, IMG_WIDTH // 8: -IMG_WIDTH // 8]
        feats = self.base_model.extract_features(x_center)
        bg = torch.zeros([feats.shape[0], feats.shape[1], feats.shape[2], feats.shape[3] // 8]).to(device)
        feats = torch.cat([bg, feats, bg], 3)

        # Add positional info
        mesh2 = get_mesh(batch_size, feats.shape[2], feats.shape[3])
        feats = torch.cat([feats, mesh2], 1)

        x = self.up1(feats, x4)
        x = self.up2(x, x3)
        x = self.outc(x)
        return x

# Gets the GPU if there is one, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

n_epochs = 10

model = MyUNet(8).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=max(n_epochs, 10) * len(train_loader) // 3, gamma=0.1)
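Before training it is worth confirming the output geometry (my addition): the model has 8 output channels — 1 heatmap logit plus 7 regression values — and should return maps at 1/MODEL_SCALE of the input resolution:

with torch.no_grad():
    dummy = torch.zeros(1, 3, IMG_HEIGHT, IMG_WIDTH).to(device)
    print(model(dummy).shape)  # expected: torch.Size([1, 8, 40, 128])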

Training

def criterion(prediction, mask, regr, size_average=True):
    # Binary mask loss
    pred_mask = torch.sigmoid(prediction[:, 0])
    # mask_loss = mask * (1 - pred_mask)**2 * torch.log(pred_mask + 1e-12) + (1 - mask) * pred_mask**2 * torch.log(1 - pred_mask + 1e-12)
    mask_loss = mask * torch.log(pred_mask + 1e-12) + (1 - mask) * torch.log(1 - pred_mask + 1e-12)
    mask_loss = -mask_loss.mean(0).sum()

    # Regression L1 loss, averaged only over cells that contain a car
    pred_regr = prediction[:, 1:]
    regr_loss = (torch.abs(pred_regr - regr).sum(1) * mask).sum(1).sum(1) / mask.sum(1).sum(1)
    regr_loss = regr_loss.mean(0)

    # Sum
    loss = mask_loss + regr_loss
    if not size_average:
        loss *= prediction.shape[0]
    return loss

def train_model(epoch, history=None):
    model.train()

    for batch_idx, (img_batch, mask_batch, regr_batch) in enumerate(tqdm(train_loader)):
        img_batch = img_batch.to(device)
        mask_batch = mask_batch.to(device)
        regr_batch = regr_batch.to(device)

        optimizer.zero_grad()
        output = model(img_batch)
        loss = criterion(output, mask_batch, regr_batch)
        if history is not None:
            history.loc[epoch + batch_idx / len(train_loader), 'train_loss'] = loss.data.cpu().numpy()

        loss.backward()

        optimizer.step()
        exp_lr_scheduler.step()

    print('Train Epoch: {} \tLR: {:.6f}\tLoss: {:.6f}'.format(
        epoch,
        optimizer.state_dict()['param_groups'][0]['lr'],
        loss.data))

def evaluate_model(epoch, history=None):
    model.eval()
    loss = 0

    with torch.no_grad():
        for img_batch, mask_batch, regr_batch in dev_loader:
            img_batch = img_batch.to(device)
            mask_batch = mask_batch.to(device)
            regr_batch = regr_batch.to(device)

            output = model(img_batch)

            loss += criterion(output, mask_batch, regr_batch, size_average=False).data

    loss /= len(dev_loader.dataset)

    if history is not None:
        history.loc[epoch, 'dev_loss'] = loss.cpu().numpy()

    print('Dev loss: {:.4f}'.format(loss))


history = pd.DataFrame()
for epoch in range(n_epochs):
    torch.cuda.empty_cache()
    train_model(epoch, history)
    evaluate_model(epoch, history)

torch.save(model.state_dict(), './model.pth')
history['train_loss'].iloc[100:].plot()

series = history.dropna()['dev_loss']
plt.scatter(series.index, series)
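The commented-out line in criterion is a CenterNet-style focal term that down-weights pixels the model already classifies easily. A standalone sketch of that variant (my reading of the commented line — it is not what was trained here, and full CenterNet additionally uses Gaussian-splatted targets rather than a binary mask):

def focal_mask_loss(pred_mask, mask):
    # pred_mask: sigmoid probabilities, mask: binary ground-truth heatmap
    pos = mask * (1 - pred_mask)**2 * torch.log(pred_mask + 1e-12)
    neg = (1 - mask) * pred_mask**2 * torch.log(1 - pred_mask + 1e-12)
    return -(pos + neg).mean(0).sum()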

Visualizing predictions

img, mask, regr = dev_dataset[0]

plt.figure(figsize=(16,16))
plt.title('Input image')
plt.imshow(np.rollaxis(img, 0, 3))
plt.show()

plt.figure(figsize=(16,16))
plt.title('Ground truth mask')
plt.imshow(mask)
plt.show()

output = model(torch.tensor(img[None]).to(device))
logits = output[0,0].data.cpu().numpy()

plt.figure(figsize=(16,16))
plt.title('Model predictions')
plt.imshow(logits)
plt.show()

plt.figure(figsize=(16,16))
plt.title('Model predictions thresholded')
plt.imshow(logits > 0)
plt.show()
torch.cuda.empty_cache()
gc.collect()

for idx in range(8):
    img, mask, regr = dev_dataset[idx]

    output = model(torch.tensor(img[None]).to(device)).data.cpu().numpy()
    coords_pred = extract_coords(output[0])
    coords_true = extract_coords(np.concatenate([mask[None], regr], 0))

    img = imread(train_images_dir.format(df_dev['ImageId'].iloc[idx]))

    fig, axes = plt.subplots(1, 2, figsize=(30,30))
    axes[0].set_title('Ground truth')
    axes[0].imshow(visualize(img, coords_true))
    axes[1].set_title('Prediction')
    axes[1].imshow(visualize(img, coords_pred))
    plt.show()

Submission

model.eval()
predictions = []
test_loader = DataLoader(dataset=test_dataset, batch_size=4, shuffle=False, num_workers=4)

for img, _, _ in tqdm(test_loader):
    with torch.no_grad():
        output = model(img.to(device))
    output = output.data.cpu().numpy()
    for out in output:
        coords = extract_coords(out)
        s = coords2str(coords)
        predictions.append(s)

test = pd.read_csv(PATH + 'sample_submission.csv')
test['PredictionString'] = predictions
test.to_csv('predictions.csv', index=False)
test.head()

The resulting output:

#   ImageId       PredictionString
# 0 ID_18fb86d04  0.14394063 0.021658449556661755 -3.07686785061...
# 1 ID_ce06d4dc1  0.15193772 0.027584688562316634 -3.08121419523...
# 2 ID_afe07078f  0.1380412 -3.120722459673391 -3.11551375326747...
# 3 ID_37caf75aa  0.15178935 0.011924028786011396 -3.10641848494...
# 4 ID_7655363ca  0.13959816 -3.1307816952173844 -3.117394277947...