2 回答

TA贡献1725条经验 获得超7个赞
$ cd ~
$ more aggregate.csv
X
X
X
X
X
X
$ more ./Desktop/folder/sub-folder/sample.csv
A,1
A,2
A,3
A,4
A,5
$ more ./Desktop/folder/sub-folder/sub-sub-folder/sample.csv
B,6
B,7
B,8
B,9
$ more ./Desktop/folder/sub-folder2/sample.csv
C,10
C,11
C,12
C,13
C,14
C,15
C,16
$ more ./Desktop/folder/sub-folder3/sub-sub-folder/sample.csv
D,17
D,18
D,19
$ python3 aggregate_samples.py ./Desktop
./Desktop/folder/sub-folder/sample.csv
./Desktop/folder/sub-folder/sub-sub-folder/sample.csv
./Desktop/folder/sub-folder2/sample.csv
./Desktop/folder/sub-folder3/sub-sub-folder/sample.csv
$ cat aggregate.csv
X,1,6,10,17
X,2,7,11,18
X,3,8,12,19
X,4,9,13,
X,5,,14,
X,,,15,
,,,16,
下面是完成此操作的代码。用到的关键技术有:用 os.walk() 递归搜索文件夹;用 csv 模块读取 sample.csv 文件(并取出第二列);用列表累积各个样本;最后再用 csv 模块把结果写出。我假设各个 sample.csv 文件的长度可能不同,因此代码对此做了处理(较短的列会用空字符串补齐)。
以上代码假设您的数据集足够小,可以全部装入内存。如果不是这样,就需要做更多的工作。
# aggregate_samples.py
import os
import sys
import argparse
import csv
def main(options):
    """Append the second column of each ``sample.csv`` to ``aggregate.csv``.

    Recursively walks ``options.directory`` (sub-directories are visited in
    sorted order), prints the path of every ``sample.csv`` found, and collects
    the second field of each of its rows as one new output column.  The new
    columns are written to the right of whatever ``aggregate.csv`` (in the
    current working directory) already contains.  Columns shorter than the
    tallest one are padded with empty strings.

    Everything is held in memory, so the data set must be small enough to fit.

    Args:
        options: Object with a ``directory`` attribute naming the root
            directory to search (e.g. an ``argparse.Namespace``).

    Returns:
        0, suitable for use as the process exit status.
    """
    aggregate_path = 'aggregate.csv'

    # Load the existing aggregate, if there is one.  It is parsed as CSV so
    # that a file produced by an earlier run keeps its separate columns
    # instead of being collapsed into a single quoted cell per line.
    try:
        with open(aggregate_path, newline='') as f:
            existing_rows = list(csv.reader(f))
    except FileNotFoundError:
        # Doesn't exist yet; it is created when the result is written.
        existing_rows = []

    # Each entry of `columns` is one output column, listed top to bottom.
    # Existing rows may be ragged, so missing cells become empty strings.
    existing_width = max((len(row) for row in existing_rows), default=0)
    columns = [
        [row[i] if i < len(row) else '' for row in existing_rows]
        for i in range(existing_width)
    ]

    for d, subdirs, files in os.walk(options.directory):
        # Sorting in place makes os.walk() descend in a deterministic order,
        # which fixes the order of the output columns.
        subdirs.sort()
        if 'sample.csv' not in files:
            continue
        file_path = os.path.join(d, 'sample.csv')
        print(file_path)
        samples = []
        with open(file_path, newline='') as f:
            for record in csv.reader(f, delimiter=','):
                if not record:
                    # Blank line: csv.reader yields an empty list; skip it.
                    continue
                # Take the 2nd column; a row without one contributes an
                # empty cell rather than raising IndexError.
                samples.append(record[1] if len(record) > 1 else '')
        columns.append(samples)

    # The table is as tall as the tallest column, *including* the columns
    # loaded from the existing aggregate (which may be the tallest).
    height = max((len(column) for column in columns), default=0)

    # Transpose the columns into rows, padding short columns with ''.
    rows = [
        [column[j] if j < len(column) else '' for column in columns]
        for j in range(height)
    ]

    # newline='' lets the csv module control line endings itself.
    with open(aggregate_path, 'w', newline='') as aggregate:
        csv.writer(aggregate, delimiter=',').writerows(rows)
    return 0
if __name__ == '__main__':
    # Command-line entry point: takes one positional argument, the root
    # directory to search, and exits with main()'s return value.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('directory', help='Directory path.')
    options = arg_parser.parse_args()
    exit_status = main(options)
    sys.exit(exit_status)
添加回答
举报