Initial commit.

2022-07-29 17:40:16 +00:00 · 2022-07-29 17:40:16 +00:00 · d1d157c074
commit d1d157c074
5 changed files with 180 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+*.img
+*.dat
+*.dhd
+*.rhd
+pool
--- a/README.md
+++ b/README.md
@ -0,0 +1,51 @@
+# <center>Dehydrate-fs (Beta)</center>
+`dehydrate-fs` is a family of tools for separating out files from disk images for the efficient storage of both. The project currently exists as a minimum viable product supporting only the `ext2/3/4` filesystems.
+
+## Quickstart
+```bash
+#Generate the filesystem map
+map "$file" mapfile.dat
+
+#Dehydrate & compress the filesystem
+dehydrate "$file" mapfile.dat | zip -1 "$file".dhd.zip -
+
+#Rehydrate the filesystem
+funzip "$file".dhd.zip | rehydrate mapfile.dat "$file".rhd
+
+#Compare results
+cmp "$file" "$file".rhd
+```
+
+## Installation
+The scripts may be run directly. Please ensure you have `perl` and `e2fsprogs` available.
+
+## Usage
+### `map FILE [MAPFILE]`
+Create a mapping of files in the partition image and extract their contents. If `MAPFILE` is not specified, the output is written to `STDOUT`. Files are placed in `./pool/` and are named with their `sha256sum`.
+
+`map` accepts an environment variable `THRESHOLD`: the minimum size, in bytes, a file must have to be mapped. It defaults to `1048576`.
+
+### `dehydrate FILE MAPFILE [OUTPUT]`
+Create a copy of `FILE` with zeros written to the locations specified by `MAPFILE`. If `OUTPUT` is not specified, the output is written to `STDOUT`. To prevent terminal corruption, the program will not run if `STDOUT` is a terminal.
+
+It is recommended that you stream the output into a compressed archive as the dehydrated file is the same size as the input. `zip` is recommended, but `xz` performs similarly enough. `gzip` does not appear to be appropriate unless higher-quality compression is desired.
+
+```bash
+dehydrate "$file" "$mapfile" | zip -1 "$file".dhd.zip -
+```
+
+### `rehydrate MAPFILE [OUTPUT]`
+Read from `STDIN`, replacing specified subsections with file data according to `MAPFILE`. `rehydrate` requires that the file contents are available under `./pool/`. If `OUTPUT` is not specified, the output is written to `STDOUT`. To prevent terminal corruption, the program will not run if `STDOUT` is a terminal.
+
+## FAQ:
+#### Why is this necessary when chunk-based deduplicating storage systems exist?
+To my knowledge, most chunk-based deduplicating storage systems operate at a very coarse level that isn't suitable for collections of average-sized or fragmented files.
+
+#### What is the danger of data loss?
+The tools are written entirely in bash with very little mind paid to error handling. It is assumed the user will verify data integrity before committing irreversible actions. That said, the pseudo-formats are developed specifically to be as simple as possible. Dehydrated `ext2/3/4` filesystem images are mountable using native tools and the mapfile format is trivial to parse.
+
+#### Why the hell is it programmed entirely in bash?!
+Because I could.
+
+#### No seriously, why?
+I am not a clever man. Even toy programs for interacting with `ext2/3/4` make my head swim. Too many details, not enough visible intent. I prefer shell scripting for this reason.
--- a/dehydrate.sh
+++ b/dehydrate.sh
@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+image="$1"; [ -z "$1" ] && { echo "No image file specified!" >&2; exit 1; }
+mfile="$2"; [ -z "$2" ] && { echo "No mapfile specified!"    >&2; exit 1; }
+[ -z "$3" -a -t 1 ] && { echo "The output of this script is unsafe for raw display." >&2
+                         exit 1; } \
+                    || ofile=${3:-/dev/stdout}
+
+BLOCKSIZE=`sed -nE 's/^#BLOCKSIZE=([0-9]+)$/\1/p' "$mfile"`
+[ -z "$BLOCKSIZE" ] && { echo "Mapfile does not specify blocksize!" >&2; exit 1; }
+
+shopt -s expand_aliases
+alias dd="dd bs=$BLOCKSIZE status=none"
+
+{
+i=0
+while read -r -d$'\n' _ block lengt _; do
+	#Leaves the remainder block for convenience
+	dd if="$image"  skip=$i count=$((block-i))
+	dd if=/dev/zero count=$((lengt-1))
+	i=$((block+lengt-1))
+done < <(grep -v ^\# "$mfile" | sed -En '/[a-f0-9]{64}/!p' | sort -unk2)
+dd if="$image" skip=$i
+} > "$ofile"
--- a/map.sh
+++ b/map.sh
@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+set -eo pipefail
+image="$1"; [ -z "$1" ] && { echo "No image file specified." >&2; exit 1; }
+
+#Run a single debugfs request against "$image" and print its output on stdout.
+#  $* - the request; all arguments are joined into one string for `debugfs -R`
+#
+#debugfs's stdout is sent to fd 3, which the enclosing subshell maps back to
+#its own stdout, so the real output bypasses the error filter below.
+#debugfs's stderr goes through that filter instead: the first line is dropped
+#(presumably the version banner debugfs prints on every start - confirm) and,
+#if any further line exists, it and everything after it are forwarded to
+#stderr and the filter exits 1. With `set -o pipefail` in effect that status
+#becomes the return status of inspect.
+inspect () {
+	(	#Jerry-rig error handling onto debugfs
+		#stdout -> fd 3 (real output), stderr -> pipe (error detection)
+		{ debugfs -R "$*" "$image" 1>&3; } 2>&1 | tail -n +2 |\
+		{ if IFS= read -r line; then
+			printf '%s\n' "$line"
+			cat; exit 1
+		  fi; } >&2
+	) 3>&1 | cat #Never invoke pager.
+}
+BLOCKSIZE=`inspect stats | sed -nE 's/^Block size:\s+([0-9]+)$/\1/p'`
+
+#Prefer same filesystem for efficiency on large files
+TMPDIR="$(mktemp -d "`pwd`/.pool.XXXXXXXX")" || "$(mktemp -d)" || exit 1
+trap 'rm -rf -- "$TMPDIR"' EXIT
+
+#Print the size in bytes of a file inside the image.
+#  $1 - debugfs file specification, e.g. "<inode>"
+#
+#The value is the "Size:" field found within the first three lines of the
+#debugfs `stat` output. sed reads the whole output itself rather than sitting
+#behind `head`: head exits as soon as it has its lines, which under
+#`set -o pipefail` can kill the upstream with SIGPIPE and turn a successful
+#lookup into a failure status.
+size () {
+	inspect stat "$1" \
+		| sed -nE '1,3s/.*Size: ([0-9]+).*/\1/p'
+}
+
+extents () {
+	remainder=$((`size $1`%BLOCKSIZE))
+	inspect dump_extents "$1" \
+		| tail -n +2 \
+		| awk '{print $5, $8, $11}' \
+		| sed '$s/$/ '$remainder'/'
+}
+
+THRESHOLD=${THRESHOLD:-$((1024*1024))}
+listall () {
+	if [ $# -eq 0 ]; then
+		listall "`inspect ls -p /`"
+		wait; return
+	fi
+
+	perl -p -e 's!/\n$!/\x0!g' <<< "$1" \
+	| while IFS=/ read -r -d $'\0' _ inode itype _ _ name isize; do
+		case ${itype:0:3} in
+		'100') [ $isize -ge $THRESHOLD ] && echo $inode $isize;;
+		'040') [[ "${name}" != @(.|..) ]] && {
+				echo "Recursing into $name" >&2;
+				listall "`inspect ls -p "<$inode>"`" & };;
+		esac
+	done
+}
+
+mkdir -p pool
+{
+echo -e "#BLOCKSIZE=$BLOCKSIZE\n#INODE SHA256SUM\n#INDEX BLOCK LENGTH [REMAINDER]"
+listall \
+| while IFS=' ' read -r -d $'\n' inode isize; do
+	#TODO: Make asynchronus
+	tmp="$(mktemp -p "$TMPDIR")"
+	sha=`inspect dump "<$inode>" /dev/stdout \
+		| tee "$tmp" \
+		| sha256sum \
+		| cut -d' ' -f1`
+	[ -f pool/$sha ] \
+		&& rm -f "$tmp" \
+		|| mv -v "$tmp" pool/$sha 1>&2
+
+	echo $inode $sha
+	extents "<$inode>" 
+done
+} > ${2:-/dev/stdout}
--- a/rehydrate.sh
+++ b/rehydrate.sh
@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+#rehydrate MAPFILE [OUTPUT]
+#
+#Filter that reads a dehydrated image on stdin and writes it to OUTPUT (or
+#stdout when omitted) with every extent listed in MAPFILE, minus its last
+#block, replaced by the corresponding blocks of ./pool/<sha256>.
+#Refuses to run when stdin is a terminal, or when no OUTPUT is given and
+#stdout is a terminal.
+set -eo pipefail
+
+if [ -t 0 ]; then echo "$0 must be run as a filter." >&2; exit 1; fi
+
+mfile="$1"
+if [ -z "$mfile" ]; then echo "No mapfile specified!" >&2; exit 1; fi
+if [ -z "$2" ] && [ -t 1 ]; then
+	echo "The output of this script is unsafe for raw display." >&2
+	exit 1
+fi
+
+BLOCKSIZE=$(sed -nE 's/^#BLOCKSIZE=([0-9]+)$/\1/p' "$mfile")
+if [ -z "$BLOCKSIZE" ]; then
+	echo "Mapfile does not specify blocksize!" >&2
+	exit 1
+fi
+
+#Build one "HASH INDEX BLOCK LENGTH [REMAINDER]" line per extent, ordered by
+#BLOCK. An "INODE SHA256SUM" line sets the hash for the extent lines after it.
+#awk alone does the filtering: `grep -v` exits 1 when a mapfile holds no
+#extents, which under pipefail would abort the script. The hash test avoids
+#regex interval syntax, which some awk implementations do not support.
+map="$(awk '
+	/^#/ || NF == 0 { next }
+	NF == 2 && length($2) == 64 && $2 ~ /^[0-9a-f]+$/ { curfile = $2; next }
+	{ print curfile, $0 }' "$mfile" \
+	| sort -unk 3)"
+
+#dd working in filesystem blocks and silent on success. iflag=fullblock
+#matters when stdin is a pipe: without it a short read still counts towards
+#count=, so fewer bytes than requested would be consumed.
+blockdd () { dd bs="$BLOCKSIZE" status=none iflag=fullblock "$@"; }
+
+{
+#First input block that has not been consumed yet
+pos=0
+#The map is read on fd 3 so that stdin stays reserved for the image data.
+#dd is run without if= to read the inherited stdin: re-opening /dev/stdin
+#can start over at offset 0 when stdin is a regular file.
+while read -r -u 3 hash lblock block lengt _; do
+	#An empty map still yields one empty here-string line
+	[ -n "$hash" ] || continue
+	#Pass through everything up to the start of this extent
+	blockdd count=$((block-pos))
+	#Drop the placeholder blocks from the input...
+	blockdd count=$((lengt-1)) >/dev/null
+	#...and emit the real data from the pool file in their place
+	blockdd if="pool/$hash" count=$((lengt-1)) skip="$lblock"
+	pos=$((block+lengt-1))
+done 3<<<"$map"
+#Remainder of the image after the last extent
+blockdd
+} > "${2:-/dev/stdout}"