NDEVR
API Documentation
TarReader.h
#include <zlib.h>
#include <NDEVR/File.h>
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <limits>
#include <memory>
#include <string>
#include <system_error>
#include <utility>
#include <vector>
11
12namespace fs = std::filesystem;
13namespace NDEVR
14{
18 class TAR
19 {
20 public:
21 using u8 = unsigned char;
22 using u32 = uint32_t;
23 using u64 = uint64_t;
24
/**
 * Abstract sequential byte source used by the TAR extraction code.
 * Concrete readers in this class wrap a plain file stream and a zlib gzip handle.
 */
struct Reader {
    virtual ~Reader() = default;

    /**
     * Reads exactly n bytes into dst.
     * @param dst Destination buffer holding at least n bytes.
     * @param n   Number of bytes requested.
     * @return true only when all n bytes were delivered.
     */
    virtual bool read_exact(u8* dst, size_t n) = 0;

    /**
     * @return true when the underlying stream reports end-of-stream.
     */
    virtual bool eof() const = 0;
};
45
49 struct FileReader final : Reader {
50 std::ifstream f;
51
56 explicit FileReader(const std::string& p) : f(p, std::ios::binary) {}
57
64 bool read_exact(u8* dst, size_t n) override {
65 f.read(reinterpret_cast<char*>(dst), std::streamsize(n));
66 return bool(f);
67 }
68
73 bool eof() const override { return f.eof(); }
74 };
75
79 struct GzReader final : Reader {
80 gzFile g = nullptr;
81
86 explicit GzReader(const std::string& p) { g = gzopen(p.c_str(), "rb"); gzbuffer(g, 1 << 20); }
87
91 ~GzReader() override { if (g) gzclose(g); }
92
99 bool read_exact(u8* dst, size_t n) override {
100 size_t got = 0;
101 while (got < n) {
102 int r = gzread(g, dst + got, unsigned(n - got));
103 if (r <= 0) return false;
104 got += size_t(r);
105 }
106 return true;
107 }
108
113 bool eof() const override { return gzeof(g) != 0; }
114 };
115
/**
 * Checks whether path a lies at or below base, comparing component by
 * component after both are converted with fs::weakly_canonical.
 * Uses the non-throwing overloads: if either path cannot be canonicalised the
 * answer is false, so callers treating "false" as "reject" stay safe.
 * @param a    Path to test.
 * @param base Directory that a must be inside of.
 * @return true when every component of the canonical base is a leading component of the canonical a.
 */
static inline bool starts_with(const fs::path& a, const fs::path& base) {
    std::error_code ec;
    const fs::path full = fs::weakly_canonical(a, ec);
    if (ec) return false;
    const fs::path root = fs::weakly_canonical(base, ec);
    if (ec) return false;

    auto it = full.begin();
    for (const auto& part : root) {
        if (it == full.end() || *it != part) return false;
        ++it;
    }
    return true;
}
131
/**
 * Tells whether the n bytes starting at p are all zero.
 * @param p Start of the buffer.
 * @param n Number of bytes to inspect; an empty range counts as all-zero.
 * @return false as soon as a non-zero byte is found, true otherwise.
 */
static inline bool all_zero(const u8* p, size_t n) {
    const u8* const end = p + n;
    while (p != end) {
        if (*p++ != 0) return false;
    }
    return true;
}
142
/**
 * Parses a numeric TAR header field.
 *
 * Two encodings are understood:
 *  - ASCII octal: leading spaces, tabs and NULs are skipped, then octal digits
 *    are consumed until the first non-octal character or the end of the field.
 *  - GNU base-256: when the first byte is 0x80 the remaining bytes are a
 *    big-endian binary number (used by GNU tar for values that do not fit in
 *    the octal field, e.g. files of 8 GiB and more). Values that do not fit in
 *    64 bits saturate to the maximum u64.
 *
 * @param p Start of the field (not necessarily NUL-terminated).
 * @param n Width of the field in bytes.
 * @return The decoded value, or 0 when the field is empty or holds no digits.
 */
static inline u64 parse_octal(const char* p, size_t n) {
    if (p == nullptr || n == 0) return 0;

    if (static_cast<u8>(p[0]) == 0x80) {
        u64 value = 0;
        for (size_t i = 1; i < n; ++i) {
            if ((value >> 56) != 0) return ~u64(0);   // next shift would drop bits
            value = (value << 8) | static_cast<u8>(p[i]);
        }
        return value;
    }

    size_t i = 0;
    while (i < n && (p[i] == ' ' || p[i] == '\t' || p[i] == '\0')) ++i;

    u64 value = 0;
    for (; i < n && p[i] >= '0' && p[i] <= '7'; ++i) {
        value = (value << 3) + u64(p[i] - '0');
    }
    return value;
}
159
/**
 * Computes the checksum of a 512-byte TAR header block: the sum of all bytes
 * as unsigned values, with the eight checksum-field bytes (offsets 148..155)
 * each counted as 32.
 * @param h Pointer to the 512-byte header block.
 * @return The computed checksum.
 */
static inline u64 tar_checksum(const u8* h) {
    // Start with the checksum field's contribution: eight bytes counted as 32 each.
    u64 total = 8 * 32;
    for (size_t i = 0; i < 148; ++i) total += h[i];
    for (size_t i = 156; i < 512; ++i) total += h[i];
    return total;
}
173
/**
 * Raw layout of a 512-byte TAR header block (USTAR field layout).
 * The extraction code overlays this struct directly on block buffers, so its
 * size must be exactly one block; that is enforced at compile time below.
 */
struct TarHdr {
    char name[100];      ///< File name (NUL-terminated unless all 100 bytes are used).
    char mode[8];        ///< File mode, octal ASCII.
    char uid[8];         ///< Owner user ID, octal ASCII.
    char gid[8];         ///< Owner group ID, octal ASCII.
    char size[12];       ///< Payload size, octal ASCII.
    char mtime[12];      ///< Modification time, octal ASCII.
    char chksum[8];      ///< Header checksum, octal ASCII.
    char typeflag;       ///< Entry type flag (e.g. '0' regular file, '5' directory).
    char linkname[100];  ///< Link target for hard/symbolic links.
    char magic[6];       ///< Format magic ("ustar").
    char version[2];     ///< Format version ("00").
    char uname[32];      ///< Owner user name.
    char gname[32];      ///< Owner group name.
    char devmajor[8];    ///< Device major number.
    char devminor[8];    ///< Device minor number.
    char prefix[155];    ///< Path prefix for names longer than the name field.
    char pad[12];        ///< Padding up to 512 bytes.
};
static_assert(sizeof(TarHdr) == 512, "TarHdr must match the 512-byte TAR block exactly");
196
202 static inline std::string hdr_name(const TarHdr& h) {
203 std::string n;
204 if (h.prefix[0]) {
205 n.assign(h.prefix, strnlen(h.prefix, sizeof(h.prefix)));
206 n.push_back('/');
207 n.append(h.name, strnlen(h.name, sizeof(h.name)));
208 }
209 else {
210 n.assign(h.name, strnlen(h.name, sizeof(h.name)));
211 }
212 return n;
213 }
214
220 static inline bool valid_hdr(const u8* blk) {
221 const TarHdr& h = *reinterpret_cast<const TarHdr*>(blk);
222 u64 chk = parse_octal(h.chksum, sizeof(h.chksum));
223 return chk == tar_checksum(blk);
224 }
225
/**
 * Returns a copy of the given PAX record string, unchanged.
 * @param rec Record text.
 * @return A copy of rec.
 */
static inline std::string read_pax_string(const std::string& rec) {
    std::string copy(rec);
    return copy;
}
234
/**
 * Replaces the permissions of p with the read/write/execute bits for owner,
 * group and others found in a TAR mode value. Errors are ignored (best effort).
 * @param p    Path whose permissions are set.
 * @param mode Mode bits; only the low nine permission bits are examined.
 */
static inline void apply_mode(const fs::path& p, u64 mode) {
    struct ModeBit { u64 mask; fs::perms perm; };
    static constexpr ModeBit kBits[] = {
        { 0400, fs::perms::owner_read },
        { 0200, fs::perms::owner_write },
        { 0100, fs::perms::owner_exec },
        { 0040, fs::perms::group_read },
        { 0020, fs::perms::group_write },
        { 0010, fs::perms::group_exec },
        { 0004, fs::perms::others_read },
        { 0002, fs::perms::others_write },
        { 0001, fs::perms::others_exec },
    };

    fs::perms wanted = fs::perms::none;
    for (const ModeBit& bit : kBits) {
        if ((mode & bit.mask) != 0) wanted |= bit.perm;
    }

    std::error_code ignored;
    fs::permissions(p, wanted, fs::perm_options::replace, ignored);
}
254
/**
 * Sets the last-write time of p to a timestamp given in seconds since the
 * system_clock epoch. The file-clock value is derived by offsetting the file
 * clock's "now" by the distance between the wanted time and the system
 * clock's "now". Errors are ignored (best effort).
 * @param p     Path to update.
 * @param mtime Timestamp in seconds.
 */
static inline void apply_mtime(const fs::path& p, u64 mtime) {
    using sys_clock = std::chrono::system_clock;
    using file_clock = fs::file_time_type::clock;

    const auto wanted = std::chrono::time_point<sys_clock>(std::chrono::seconds(mtime));
    const auto offset = wanted - sys_clock::now();
    const auto ftime = file_clock::now() + offset;

    std::error_code ignored;
    fs::last_write_time(p, ftime, ignored);
}
266
273 static inline bool read_block(Reader& r, u8* buf) {
274 return r.read_exact(buf, 512);
275 }
276
284 static inline bool skip_bytes(Reader& r, u64 n, std::vector<u8>& tmp) {
285 tmp.resize(size_t(std::min<u64>(n, 1 << 20)));
286 u64 left = n;
287 while (left) {
288 size_t chunk = size_t(std::min<u64>(left, tmp.size()));
289 if (!r.read_exact(tmp.data(), chunk)) return false;
290 left -= chunk;
291 }
292 return true;
293 }
294
312 static int Extract(const File& src, const File& dest)
313 {
314 std::error_code ec;
315 fs::create_directories(dest.c_str(), ec);
316 if (ec) return -1;
317
318 std::ifstream probe(src.c_str(), std::ios::binary);
319 if (!probe) return -2;
320 unsigned char sig[2] = { 0,0 };
321 probe.read(reinterpret_cast<char*>(sig), 2);
322 probe.close();
323
324 std::unique_ptr<Reader> rd;
325 if (sig[0] == 0x1F && sig[1] == 0x8B) rd = std::make_unique<GzReader>(src.c_str());
326 else rd = std::make_unique<FileReader>(src.c_str());
327
328 if (auto* gr = dynamic_cast<GzReader*>(rd.get()); gr && !gr->g) return -3;
329 if (auto* fr = dynamic_cast<FileReader*>(rd.get()); fr && !fr->f) return -3;
330
331 fs::path base = fs::absolute(dest.c_str());
332 std::vector<u8> block(512), tmp;
333 std::string pax_path, pax_linkpath, gnu_longname, gnu_longlink;
334
335 for (;;) {
336 if (!read_block(*rd, block.data())) return -4;
337 if (all_zero(block.data(), 512)) {
338 if (!read_block(*rd, block.data())) return 0;
339 if (all_zero(block.data(), 512)) return 0;
340 }
341 if (!valid_hdr(block.data())) return -5;
342
343 const TarHdr& h = *reinterpret_cast<const TarHdr*>(block.data());
344 u64 size = parse_octal(h.size, sizeof(h.size));
345 u64 mode = parse_octal(h.mode, sizeof(h.mode));
346 u64 mtime = parse_octal(h.mtime, sizeof(h.mtime));
347 char type = h.typeflag ? h.typeflag : '0';
348
349 std::string name = hdr_name(h);
350 if (!gnu_longname.empty()) { name = gnu_longname; gnu_longname.clear(); }
351 if (!pax_path.empty()) { name = pax_path; pax_path.clear(); }
352
353 std::string linkname;
354 if (h.linkname[0]) linkname.assign(h.linkname, strnlen(h.linkname, sizeof(h.linkname)));
355 if (!gnu_longlink.empty()) { linkname = gnu_longlink; gnu_longlink.clear(); }
356 if (!pax_linkpath.empty()) { linkname = pax_linkpath; pax_linkpath.clear(); }
357
358 if (!std::strncmp(h.magic, "ustar", 5)) {}
359
360 if (type == 'x') {
361 std::string pax;
362 pax.resize(size_t(size));
363 if (size) {
364 tmp.resize(size_t(size));
365 if (!rd->read_exact(tmp.data(), size_t(size))) return -6;
366 pax.assign(reinterpret_cast<char*>(tmp.data()), pax.size());
367 }
368 u64 pad = (512 - (size % 512)) % 512;
369 if (pad && !skip_bytes(*rd, pad, tmp)) return -6;
370 size_t i = 0;
371 while (i < pax.size()) {
372 size_t j = pax.find('\n', i);
373 if (j == std::string::npos) break;
374 std::string line = pax.substr(i, j - i);
375 size_t sp = line.find(' ');
376 if (sp != std::string::npos) {
377 std::string kv = line.substr(sp + 1);
378 if (kv.rfind("path=", 0) == 0) pax_path = kv.substr(5);
379 else if (kv.rfind("linkpath=", 0) == 0) pax_linkpath = kv.substr(9);
380 }
381 i = j + 1;
382 }
383 continue;
384 }
385
386 if (type == 'g') {
387 if (size) {
388 if (!skip_bytes(*rd, size, tmp)) return -7;
389 }
390 u64 pad = (512 - (size % 512)) % 512;
391 if (pad && !skip_bytes(*rd, pad, tmp)) return -7;
392 continue;
393 }
394
395 if (type == 'L') {
396 if (size == 0) continue;
397 tmp.resize(size_t(size));
398 if (!rd->read_exact(tmp.data(), size_t(size))) return -8;
399 gnu_longname.assign(reinterpret_cast<char*>(tmp.data()), size_t(size));
400 while (!gnu_longname.empty() && (gnu_longname.back() == '\0' || gnu_longname.back() == '\n')) gnu_longname.pop_back();
401 u64 pad = (512 - (size % 512)) % 512;
402 if (pad && !skip_bytes(*rd, pad, tmp)) return -8;
403 continue;
404 }
405
406 if (type == 'K') {
407 if (size == 0) continue;
408 tmp.resize(size_t(size));
409 if (!rd->read_exact(tmp.data(), size_t(size))) return -9;
410 gnu_longlink.assign(reinterpret_cast<char*>(tmp.data()), size_t(size));
411 while (!gnu_longlink.empty() && (gnu_longlink.back() == '\0' || gnu_longlink.back() == '\n')) gnu_longlink.pop_back();
412 u64 pad = (512 - (size % 512)) % 512;
413 if (pad && !skip_bytes(*rd, pad, tmp)) return -9;
414 continue;
415 }
416
417 fs::path out = fs::absolute(base / fs::path(name).lexically_normal());
418 if (!starts_with(out, base)) {
419 u64 skip = size + ((512 - (size % 512)) % 512);
420 if (skip && !skip_bytes(*rd, skip, tmp)) return -10;
421 continue;
422 }
423
424 switch (type) {
425 case '0':
426 case '\0':
427 case '7': {
428 fs::create_directories(out.parent_path(), ec);
429 std::ofstream of(out, std::ios::binary | std::ios::trunc);
430 if (!of) return -11;
431 u64 left = size;
432 tmp.resize(size_t(std::min<u64>(left, 1 << 20)));
433 while (left) {
434 size_t chunk = size_t(std::min<u64>(left, tmp.size()));
435 if (!rd->read_exact(tmp.data(), chunk)) return -12;
436 of.write(reinterpret_cast<char*>(tmp.data()), std::streamsize(chunk));
437 if (!of) return -12;
438 left -= chunk;
439 }
440 of.flush();
441 apply_mode(out, mode & 0777);
442 apply_mtime(out, mtime);
443 u64 pad = (512 - (size % 512)) % 512;
444 if (pad && !skip_bytes(*rd, pad, tmp)) return -12;
445 } break;
446 case '5': {
447 fs::create_directories(out, ec);
448 apply_mode(out, mode & 0777);
449 u64 pad = (512 - (size % 512)) % 512;
450 if (pad && !skip_bytes(*rd, pad, tmp)) return -13;
451 } break;
452 case '2': {
453 fs::create_directories(out.parent_path(), ec);
454 std::error_code sec;
455 fs::remove(out, sec);
456 if (!linkname.empty()) {
457 std::error_code lec;
458 fs::create_symlink(linkname, out, lec);
459 }
460 u64 pad = (512 - (size % 512)) % 512;
461 if (pad && !skip_bytes(*rd, pad, tmp)) return -14;
462 } break;
463 case '1': {
464 fs::create_directories(out.parent_path(), ec);
465 if (!linkname.empty()) {
466 std::error_code hec;
467 if (fs::exists(base / linkname)) fs::create_hard_link(base / linkname, out, hec);
468 }
469 u64 pad = (512 - (size % 512)) % 512;
470 if (pad && !skip_bytes(*rd, pad, tmp)) return -15;
471 } break;
472 default: {
473 u64 pad_all = size + ((512 - (size % 512)) % 512);
474 if (pad_all && !skip_bytes(*rd, pad_all, tmp)) return -16;
475 } break;
476 }
477 }
478 }
479 };
480
481} // namespace NDEVR
Logic for reading or writing to a file as well as navigating filesystems or other common file operations.
Definition File.h:53
constexpr const char * c_str() const
Used to access the raw memory of this string.
Definition String.h:423
Provides utilities for reading and extracting TAR and TAR.GZ archives.
Definition TarReader.h:19
static int Extract(const File &src, const File &dest)
Extracts all entries from a TAR or TAR.GZ archive to a destination directory.
Definition TarReader.h:312
static bool skip_bytes(Reader &r, u64 n, std::vector< u8 > &tmp)
Skips over a specified number of bytes in the reader stream.
Definition TarReader.h:284
static bool valid_hdr(const u8 *blk)
Validates a 512-byte TAR header block by verifying its checksum.
Definition TarReader.h:220
static std::string read_pax_string(const std::string &rec)
Returns a PAX extended attribute record string as-is.
Definition TarReader.h:231
static bool read_block(Reader &r, u8 *buf)
Reads a single 512-byte block from the reader.
Definition TarReader.h:273
static bool all_zero(const u8 *p, size_t n)
Checks whether all bytes in a buffer are zero.
Definition TarReader.h:138
static u64 tar_checksum(const u8 *h)
Computes the TAR checksum for a 512-byte header block.
Definition TarReader.h:165
static void apply_mtime(const fs::path &p, u64 mtime)
Sets the last modification time of a file to the given Unix timestamp.
Definition TarReader.h:260
static bool starts_with(const fs::path &a, const fs::path &base)
Checks whether path a starts with the given base path using weakly canonical forms.
Definition TarReader.h:122
static std::string hdr_name(const TarHdr &h)
Extracts the full file name from a TAR header, combining prefix and name fields.
Definition TarReader.h:202
static u64 parse_octal(const char *p, size_t n)
Parses an octal number from a TAR header field string.
Definition TarReader.h:149
static void apply_mode(const fs::path &p, u64 mode)
Applies POSIX file permissions to the given path based on a TAR mode value.
Definition TarReader.h:240
The primary namespace for the NDEVR SDK.
@ type
The type identifier string for this model node.
Definition Model.h:58
@ name
The display name of the object.
STL namespace.
Reader implementation for uncompressed files using std::ifstream.
Definition TarReader.h:49
bool eof() const override
Checks whether the end of the file has been reached.
Definition TarReader.h:73
FileReader(const std::string &p)
Constructs a FileReader and opens the specified file in binary mode.
Definition TarReader.h:56
bool read_exact(u8 *dst, size_t n) override
Reads exactly n bytes from the file into the destination buffer.
Definition TarReader.h:64
std::ifstream f
The underlying file input stream.
Definition TarReader.h:50
Reader implementation for gzip-compressed files using zlib's gzFile interface.
Definition TarReader.h:79
GzReader(const std::string &p)
Constructs a GzReader and opens the specified gzip file for reading.
Definition TarReader.h:86
bool read_exact(u8 *dst, size_t n) override
Reads exactly n bytes from the gzip stream into the destination buffer.
Definition TarReader.h:99
gzFile g
The underlying zlib gzip file handle.
Definition TarReader.h:80
bool eof() const override
Checks whether the end of the gzip stream has been reached.
Definition TarReader.h:113
~GzReader() override
Destructor that closes the gzip file handle if open.
Definition TarReader.h:91
Abstract base class for sequential byte stream reading.
Definition TarReader.h:28
virtual bool read_exact(u8 *dst, size_t n)=0
Reads exactly n bytes into the destination buffer.
virtual bool eof() const =0
Checks whether the end of the stream has been reached.
Represents the raw 512-byte POSIX TAR header structure (USTAR format).
Definition TarReader.h:177
char pad[12]
Padding to fill the 512-byte block.
Definition TarReader.h:194
char version[2]
USTAR version ("00").
Definition TarReader.h:188
char chksum[8]
Header checksum in octal ASCII.
Definition TarReader.h:184
char prefix[155]
Filename prefix for paths longer than 100 characters.
Definition TarReader.h:193
char typeflag
Entry type flag (e.g., '0' for regular file, '5' for directory).
Definition TarReader.h:185
char gid[8]
Owner group ID in octal ASCII.
Definition TarReader.h:181
char mtime[12]
Last modification time in octal ASCII (Unix epoch seconds).
Definition TarReader.h:183
char devminor[8]
Device minor number for special files.
Definition TarReader.h:192
char magic[6]
USTAR magic string ("ustar").
Definition TarReader.h:187
char uid[8]
Owner user ID in octal ASCII.
Definition TarReader.h:180
char devmajor[8]
Device major number for special files.
Definition TarReader.h:191
char linkname[100]
Name of the linked file for hard/symbolic links.
Definition TarReader.h:186
char name[100]
File name (null-terminated or full 100 chars).
Definition TarReader.h:178
char uname[32]
Owner user name.
Definition TarReader.h:189
char mode[8]
File mode in octal ASCII.
Definition TarReader.h:179
char size[12]
File size in octal ASCII.
Definition TarReader.h:182
char gname[32]
Owner group name.
Definition TarReader.h:190