ReLaX-VQA / src /data_processing /recover_median_train_test.py
Xinyi Wang
first commit
211b431
import pandas as pd
from scipy.io import loadmat, savemat
import numpy as np
def split_features(data_name, df, result_file, feature_path, layer_name):
data = loadmat(result_file)
all_variables = {}
for key, value in data.items():
# ignore '__'
if not key.startswith('__') and not key.endswith('__'):
all_variables[key] = value
if data_name == 'konvid_1k':
for i in range(len(all_variables['Test_videos_Median_model'])):
test_vids = all_variables['Test_videos_Median_model'][i]
test_vids = test_vids.tolist()
else:
test_vids = []
for i in range(len(all_variables['Test_videos_Median_model'])):
vid = all_variables['Test_videos_Median_model'][i].strip()
test_vids.append(vid)
# drop greyscale videos
if data_name == 'youtube_ugc':
grey_df = pd.read_csv(f'{metadata_path}/greyscale_report/{data_name.upper()}_greyscale_metadata.csv')
grey_indices = grey_df.iloc[:, 0].tolist()
df = df.drop(index=grey_indices).reset_index(drop=True)
all_vids = df.iloc[:, 0].tolist()
print(all_vids)
print(test_vids)
train_vids = list(set(all_vids) - set(test_vids))
print(len(test_vids))
print(len(train_vids))
# split all_data into train and test based on vids
train_df = df[df.iloc[:, 0].isin(train_vids)]
test_df = df[df.iloc[:, 0].isin(test_vids)]
print(len(test_df))
# reorder columns
sorted_train_df = pd.DataFrame({'vid': train_df.iloc[:, 0], 'framerate': train_df['framerate'], 'MOS': train_df['mos']})
sorted_test_df = pd.DataFrame({'vid': test_df.iloc[:, 0], 'framerate': test_df['framerate'], 'MOS': test_df['mos']})
# use indices from the train and test DataFrames to split features
data = loadmat(f'{feature_path}{layer_name}/original_features/{network_name}_{data_name}_original_features.mat')
features = data[f'{data_name}']
if data_name == 'youtube_ugc':
features = np.delete(features, grey_indices, axis=0)
train_features = features[train_df.index]
test_features = features[test_df.index]
# recover the median train and test files
sorted_train_df.to_csv(f'{metadata_path}mos_files/{data_name}_MOS_train.csv', index=False)
sorted_test_df.to_csv(f'{metadata_path}mos_files/{data_name}_MOS_test.csv', index=False)
savemat(f'{feature_path}{layer_name}/relaxvqa_{data_name}_original_train_features.mat', {f'{data_name}_train_features': train_features})
savemat(f'{feature_path}{layer_name}/relaxvqa_{data_name}_original_test_features.mat', {f'{data_name}_test_features': test_features})
return train_features, test_features, test_vids
if __name__ == '__main__':
metadata_path = '../../metadata/'
feature_path = '../../features_merged_frag/'
result_path = f'../../log/result/'
data_name = 'cvd_2014'
network_name = 'relaxvqa'
layer_name = 'pool'
model_name = 'Mlp'
select_criteria = 'byrmse'
df = pd.read_csv(f'{metadata_path}/{data_name.upper()}_metadata.csv')
result_file = f'{result_path}{data_name}_{network_name}_{select_criteria}.mat'
train_features, test_features, test_vids = split_features(data_name, df, result_file, feature_path, layer_name)