1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
import random as rand

# Seed the shared generator at import time so the random splits below start
# from a fixed state.
rand.seed(0)


class RatingData(object):
    '''
    Loader for the MovieLens ``rating.dat`` data set.

    Each record is parsed as four ``::``-separated fields: two integer ids,
    a float rating and a fourth field that is ignored.
    '''

    TRAIN_SET_KEY = 'trainset'
    TEST_SET_KEY = 'testset'

    @staticmethod
    def __loadFile(filepath, logger):
        '''
        Lazily yield the lines of a UTF-8 data file.

        Args:
            filepath: path of the data file
            logger: logging object; only its ``info`` method is used
        Returns:
            A generator producing one line at a time, in file order, with
            surrounding CR/LF characters removed.
        '''
        with open(filepath, 'r', encoding='UTF-8') as fp:
            for rowNum, line in enumerate(fp):
                yield line.strip('\r\n')
                # Progress is reported once the consumer asks for the next row.
                if rowNum % 100000 == 0:
                    logger.info('加载数据文件生成器 {filepath}({rowNum})'.format(filepath=filepath, rowNum=rowNum))
        logger.info('加载数据文件 {filepath} 成功'.format(filepath=filepath))

    @staticmethod
    def getRatingDict(logger, filepath, proportion=0.7, transport=False):
        '''
        Load the ratings and randomly split them into a training set and a
        test set.

        Args:
            logger: logging object; only its ``info`` method is used
            filepath: path of the data file
            proportion: probability that a rating goes to the training set;
                the remaining ratings go to the test set
            transport: whether to transpose the data; when False the result is
                keyed user -> item, when True it is keyed item -> user
        Returns:
            A dict with the keys ``TRAIN_SET_KEY`` and ``TEST_SET_KEY``; each
            value is a nested dict ``{outer id: {inner id: rating}}``.
        Raises:
            ValueError: if a non-blank line does not consist of exactly four
                ``::``-separated fields with numeric ids and rating.
        '''
        # Resolved before the loop so the summary below also works when the
        # file contains no records at all.
        dimA = '电影' if transport else '用户'

        trainset = {}
        trainsetSize = 0
        testset = {}
        testsetSize = 0

        for line in RatingData.__loadFile(filepath, logger):
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on the four-way unpack below.
            if not line.strip():
                continue
            if transport:
                item, user, rating, _ = line.split('::')
            else:
                user, item, rating, _ = line.split('::')
            user = int(user)
            item = int(item)
            rating = float(rating)
            # Exactly one random draw per record keeps the split reproducible.
            if rand.random() < proportion:
                trainset.setdefault(user, {})[item] = rating
                trainsetSize += 1
            else:
                testset.setdefault(user, {})[item] = rating
                testsetSize += 1

        logger.info('划分训练集和测试集成功!')
        logger.info('训练集{dimA}数量 {item}'.format(dimA=dimA, item=len(trainset)))
        logger.info('训练集大小 {size}'.format(size=trainsetSize))
        logger.info('测试集{dimA}数量 {item}'.format(dimA=dimA, item=len(testset)))
        logger.info('测试集大小 {size}'.format(size=testsetSize))

        return {RatingData.TRAIN_SET_KEY: trainset, RatingData.TEST_SET_KEY: testset}

    @staticmethod
    def getSmallRatingData(logger, filepath, proportion=0.7, ratio=0.3, transport=False):
        '''
        Load the ratings, split them into training and test sets, and shrink
        both by randomly sampling outer and inner ids.

        Args:
            logger: logging object; only its ``info`` method is used
            filepath: path of the data file
            proportion: probability that a rating goes to the training set;
                the remaining ratings go to the test set
            ratio: fraction of the original data set to keep along each
                dimension
            transport: whether to transpose the data; when False the result is
                keyed user -> item, when True it is keyed item -> user
        Returns:
            A dict with the keys ``TRAIN_SET_KEY`` and ``TEST_SET_KEY``. Both
            values contain one entry per sampled outer id (possibly an empty
            dict), restricted to the sampled inner ids.
        '''
        dataset = RatingData.getRatingDict(logger, filepath, proportion, transport)
        trainset = dataset[RatingData.TRAIN_SET_KEY]
        testset = dataset[RatingData.TEST_SET_KEY]

        # Outer ids are sampled from the training set only.
        userset = set(trainset.keys())
        smallUserset = {user for user in userset if rand.random() < ratio}

        # Every inner id seen in the training set ...
        allItems = set()
        for user in userset:
            allItems |= set(trainset[user].keys())
        # ... and those reachable through the sampled outer ids.
        candidateItems = set()
        for user in smallUserset:
            candidateItems |= set(trainset[user].keys())

        smallItemset = set()
        # With no candidates there is nothing to sample, and the rescaling
        # below would divide by zero.
        if candidateItems:
            # Rescale so that the expected number of kept inner ids is about
            # ratio * len(allItems), although only candidates can be drawn.
            itemRatio = ratio * len(allItems) / len(candidateItems)
            smallItemset = {item for item in candidateItems if rand.random() < itemRatio}

        smallTrainset = {
            user: {
                item: rating
                for item, rating in trainset[user].items()
                if item in smallItemset
            }
            for user in smallUserset
        }
        # A sampled outer id without test ratings still gets an empty entry.
        smallTestset = {
            user: {
                item: rating
                for item, rating in testset.get(user, {}).items()
                if item in smallItemset
            }
            for user in smallUserset
        }

        if transport:
            dimA, dimB = '电影', '用户'
        else:
            dimA, dimB = '用户', '电影'
        # Report the full inner-id count for the original data set, not the
        # count restricted to the sampled outer ids.
        logger.info('原数据集[{dimA}*{dimB}]大小:[{lenA}*{lenB}]'.format(dimA=dimA, dimB=dimB, lenA=len(userset), lenB=len(allItems)))
        logger.info('收缩数据集[{dimA}*{dimB}]大小:[{lenA}*{lenB}]'.format(dimA=dimA, dimB=dimB, lenA=len(smallUserset), lenB=len(smallItemset)))

        return {RatingData.TRAIN_SET_KEY: smallTrainset, RatingData.TEST_SET_KEY: smallTestset}
|