2 回答
TA贡献1860条经验 获得超8个赞
我的总体方法是合理的,但由于 Java 的 ZipFile 返回的详细信息不足而受到阻碍。例如,有时在下一个本地文件头(local file header)开始之前,压缩数据的末尾会多出 16 个字节(即数据描述符 data descriptor:4 字节签名 + CRC-32 + 压缩大小 + 未压缩大小)。ZipFile 中没有任何内容可以帮助解决此问题。
zip4j 似乎是一个更好的选择,并提供了以下方法: header.getOffsetLocalHeader()
消除了一些容易出错的计算。
TA贡献1785条经验 获得超8个赞
我还可以使用 zip4j 通过以下代码使其工作。
但是我仍然不理解这个用于计算结束偏移量的等式: long endFile = 30 + offset + header.getFileNameLength() + compressedSize - 1;。其中的 30 是从哪里来的?我如何确保该等式包含了适用于所有情况的所有必要变量?
/**
 * Extracts a fixed set of entries from a ZIP archive stored in S3 without downloading
 * the whole archive.
 *
 * <p>Steps: read the 22-byte end-of-central-directory (EOCD) record from the tail of the
 * object, use it to locate and download the central directory, write "central directory +
 * EOCD" to a temporary file (with the central-directory offset patched to 0 so the file is
 * self-consistent), and then let {@code getZipFile1} fetch each requested entry by range.
 *
 * <p>Limitations: archives with a trailing comment and ZIP64 archives are rejected with an
 * {@link IllegalStateException} instead of being silently misread.
 *
 * @param args unused
 * @throws Exception on any S3, I/O or archive-format failure
 */
public static void main(String[] args) throws Exception {
    // S3Client holds HTTP connections; close it when done.
    try (S3Client s3Client = S3Client.builder()
            .credentialsProvider(StaticCredentialsProvider
                    .create(AwsSessionCredentials.create(ACCESS_KEY, SECRET_KEY, SESSION_TOKEN)))
            .region(Region.US_WEST_2)
            .build()) {
        HeadObjectResponse headObject = s3Client.headObject(HeadObjectRequest.builder()
                .bucket(BUCKET)
                .key(OBJECT_PATH)
                .build());
        long zipSize = headObject.contentLength();
        if (zipSize < 22) {
            throw new IllegalStateException("Object is too small to be a ZIP archive: " + zipSize + " bytes");
        }

        // fetch the last 22 bytes (end-of-central-directory record; assuming the comment field is empty).
        // HTTP byte ranges are inclusive, so the last valid index is zipSize - 1.
        long eocdStart = zipSize - 22;
        byte[] eocd;
        try (var eocdStream = s3Client.getObject(GetObjectRequest.builder()
                .bucket(BUCKET)
                .key(OBJECT_PATH)
                .range("bytes=%d-%d".formatted(eocdStart, zipSize - 1))
                .build())) {
            System.out.println("eocd start: " + eocdStart);
            eocd = IOUtils.toByteArray(eocdStream);
        }
        // The EOCD record starts with the signature "PK\005\006". If it is not there, the
        // archive most likely has a trailing comment and the record sits further back.
        if (eocd.length != 22 || eocd[0] != 0x50 || eocd[1] != 0x4B || eocd[2] != 0x05 || eocd[3] != 0x06) {
            throw new IllegalStateException(
                    "No end-of-central-directory record in the last 22 bytes (archive comment present, or not a ZIP)");
        }

        // get the start offset and size of the central directory.
        // Both fields are unsigned 32-bit values, so widen them to long to avoid negative numbers above 2 GiB.
        long cdSize = Integer.toUnsignedLong(byteArrayToLeInt(Arrays.copyOfRange(eocd, 12, 16)));
        long cdStart = Integer.toUnsignedLong(byteArrayToLeInt(Arrays.copyOfRange(eocd, 16, 20)));
        System.out.println("cdStart: " + cdStart);
        System.out.println("cdSize: " + cdSize);
        // 0xFFFFFFFF means "look in the ZIP64 EOCD record", which this code does not parse.
        if (cdSize == 0xFFFFFFFFL || cdStart == 0xFFFFFFFFL) {
            throw new IllegalStateException("ZIP64 archives are not supported");
        }
        if (cdSize == 0 || cdStart + cdSize > eocdStart) {
            throw new IllegalStateException(
                    "Invalid central directory location: start=" + cdStart + ", size=" + cdSize);
        }

        // get the full central directory
        byte[] cd;
        try (var cdStream = s3Client.getObject(GetObjectRequest.builder()
                .bucket(BUCKET)
                .key(OBJECT_PATH)
                .range("bytes=%d-%d".formatted(cdStart, cdStart + cdSize - 1))
                .build())) {
            cd = IOUtils.toByteArray(cdStream);
        }

        // reset the cd start to 0 in the eocd, since that is
        // where the central directory will appear in our new temp file
        System.arraycopy(leIntToByteArray(0), 0, eocd, 16, 4);
        System.out.println(cd.length + eocd.length);

        File tempFile = Files.createTempFile("temp", "zip").toFile();
        try {
            // write the full dir + eocd
            try (FileOutputStream output = new FileOutputStream(tempFile)) {
                output.write(cd);
                output.write(eocd);
            }
            getZipFile1(s3Client, tempFile, "a2ed09e5-dfdb-4a66-95f5-8bb62bc8fafd-2023-05-23T10_07_19Z.warc.gz");
            getZipFile1(s3Client, tempFile, "index.cdx.gz");
            getZipFile1(s3Client, tempFile, "index.cdx");
            getZipFile1(s3Client, tempFile, "extraPages.jsonl");
            getZipFile1(s3Client, tempFile, "pages.jsonl");
            getZipFile1(s3Client, tempFile, "datapackage.json");
            getZipFile1(s3Client, tempFile, "datapackage-digest.json");
        } finally {
            // the temp file is only a scratch copy of the directory; do not leave it behind
            Files.deleteIfExists(tempFile.toPath());
        }
    }
}
/**
 * Downloads and extracts every non-directory entry whose name contains {@code file}.
 *
 * <p>{@code tempFile} holds only the archive's central directory and EOCD record, so zip4j
 * can list the entries; the local-header offsets it reports still refer to the original
 * object in S3. For each match, the entry's byte range is fetched from S3 and decompressed
 * into {@code /home/joao/Downloads/folder/}.
 *
 * <p>Layout of one entry in the archive:
 * <pre>
 *   30 bytes   fixed part of the local file header
 *   n  bytes   file name          (length stored at header bytes 26-27)
 *   m  bytes   extra field        (length stored at header bytes 28-29)
 *   compressedSize bytes of data
 *   optional data descriptor (up to 24 bytes) when general-purpose flag bit 3 is set
 * </pre>
 * The extra field of the local header may differ in length from the one in the central
 * directory, so {@code n} and {@code m} are read from the local header itself.
 *
 * <p>NOTE(review): matching is by substring, so {@code "index.cdx"} also selects
 * {@code "index.cdx.gz"}; tighten the match if that is not intended.
 *
 * @param s3Client client used for the ranged reads
 * @param tempFile local file containing the central directory followed by the EOCD record
 * @param file     substring of the entry name(s) to extract
 * @throws Exception on any S3, I/O or archive-format failure
 */
private static void getZipFile1(S3Client s3Client, File tempFile, String file) throws Exception {
    String outputDir = "/home/joao/Downloads/folder/";
    try (ZipFile zipFile = new ZipFile(tempFile)) {
        for (var header : zipFile.getFileHeaders()) {
            if (header.isDirectory() || !header.getFileName().contains(file)) {
                continue;
            }
            System.out.println(header);
            long offset = header.getOffsetLocalHeader();
            // keep this as long: an int cast would silently corrupt the range for entries over 2 GiB
            long compressedSize = header.getCompressedSize();

            // Read the fixed 30-byte part of the local header to learn the real name/extra lengths.
            byte[] fixedHeader;
            try (var headerStream = s3Client.getObject(GetObjectRequest.builder()
                    .bucket(BUCKET)
                    .key(OBJECT_PATH)
                    .range("bytes=%d-%d".formatted(offset, offset + 29))
                    .build())) {
                fixedHeader = IOUtils.toByteArray(headerStream);
            }
            // A local file header starts with the signature "PK\003\004".
            if (fixedHeader.length < 30 || fixedHeader[0] != 0x50 || fixedHeader[1] != 0x4B
                    || fixedHeader[2] != 0x03 || fixedHeader[3] != 0x04) {
                throw new IllegalStateException(
                        "No local file header at offset " + offset + " for " + header.getFileName());
            }
            // both lengths are unsigned 16-bit little-endian values
            int localNameLength = (fixedHeader[26] & 0xFF) | ((fixedHeader[27] & 0xFF) << 8);
            int localExtraLength = (fixedHeader[28] & 0xFF) | ((fixedHeader[29] & 0xFF) << 8);
            // Signature + CRC + two sizes: 16 bytes normally, 24 with ZIP64 sizes. Asking for
            // a few bytes too many is harmless; the reader ignores anything past the entry.
            long descriptorLength = header.isDataDescriptorExists() ? 24 : 0;
            long endFile = offset + 30 + localNameLength + localExtraLength
                    + compressedSize + descriptorLength - 1;

            byte[] fileBytes;
            try (var entryStream = s3Client.getObject(GetObjectRequest.builder()
                    .bucket(BUCKET)
                    .key(OBJECT_PATH)
                    .range("bytes=%d-%d".formatted(offset, endFile))
                    .build())) {
                fileBytes = IOUtils.toByteArray(entryStream);
            }

            // Entry names come from the archive; refuse names that escape the output directory.
            var outputPath = new File(outputDir + header.getFileName()).toPath().normalize();
            if (!outputPath.startsWith(new File(outputDir).toPath())) {
                throw new IllegalStateException("Entry name escapes output directory: " + header.getFileName());
            }
            File outputFile = outputPath.toFile();

            try (ZipInputStream zipInputStream = new ZipInputStream(new ByteArrayInputStream(fileBytes))) {
                // sizes and CRC are taken from the central-directory header passed in here
                zipInputStream.getNextEntry(header, true);
                Files.deleteIfExists(outputFile.toPath());
                FileUtils.copyInputStreamToFile(zipInputStream, outputFile);
            }
        }
    }
}
添加回答
举报