Initial commit, first draft. (4c6206af) · Commits · The Heavy / imgslipstream

.idea/.gitignore

0 → 100644

+3 −0

Original line number	Diff line number	Diff line
		# Default ignored files
		/shelf/
		/workspace.xml

README.md

0 → 100644

+17 −0

Original line number	Diff line number	Diff line
		imgslipstream

		what is it

		a simple script that scans an html file and replaces every img tag's src with a base64-encoded copy of the image data, i.e. it embeds the image in the file. (back when i used to play with windows xp installers, adding things such as updates to the install disk was known as 'slipstreaming'.)

		why

		because imgur announced recently (apr 2023) that they would be deleting a bunch of images, and i wanted to keep the pictures with some twine games that i've been playing recently, and i also didn't want to do that by hand (i am also something of a data hoarder ;)

		how

		the python script doesn't pretend to understand html. it reads the file into memory, then looks for the <img opening tag, looks for a src=" attribute following that, tries to determine if that contains a url, and if it does, downloads the url, base64 encodes the result, and jams it into the src attribute. it also looks for the html-escaped forms &lt;img and src=&quot;, which crop up in twine files a fair bit. it then writes the results to a new file.

		caveats

		badly formatted html won't stop it, but it may make it react unpredictably. it tries to strip unnecessary whitespace from the url, but that doesn't always work right. if a quote is missing, it may miss the url entirely, or it may grab half the file and think that is the url; i capped the url length at 40 characters to try to avoid sending garbage requests. i have no idea what it does with urls that fail to load, or that don't contain an image. i do not recommend removing the original file until you have thoroughly checked everything.

imgslipstream.py

0 → 100755

+139 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env python3

		import requests
		import argparse
		import pathlib
		import logging
		import logging.handlers
		import base64

		# Root logger, as before; handlers are attached in main().
		log = logging.getLogger()

		# Longest src value that is still treated as a URL. A missing quote can make
		# the scanner grab a large chunk of the file, and this cap keeps such garbage
		# from being sent as a request.
		MAX_URL_LENGTH = 40

		# Seconds to wait for a server before giving up on one image.
		REQUEST_TIMEOUT = 30

		# (tag opener, tag closer) pairs that are scanned for, and
		# (attribute opener, closing quote) pairs for the src value.
		# NOTE(review): previously the primary and alternate markers were
		# byte-identical, so the fallback never matched and the second pass repeated
		# the first. The HTML-escaped forms are what the README describes for Twine
		# files -- confirm against the repository.
		TAG_FORMS = (('<img', '>'), ('&lt;img', '&gt;'))
		SRC_QUOTES = (('src="', '"'), ('src=&quot;', '&quot;'))

		# File extensions whose MIME subtype is not simply the extension itself.
		MIME_SUBTYPES = {'jpg': 'jpeg', 'jpe': 'jpeg', 'svg': 'svg+xml', 'tif': 'tiff'}


		def image_mime_type(url, content_type=None):
		    """Return the MIME type to use for *url* in a data URI, or None.

		    The server's Content-Type wins when it names an image type. A ``text/*``
		    response (typically an error page) is rejected. Otherwise the type is
		    derived from the extension of the URL path, ignoring any query string.
		    """
		    from urllib.parse import urlsplit  # local: not among the file's imports

		    declared = (content_type or '').partition(';')[0].strip().lower()
		    if declared.startswith('image/'):
		        return declared
		    if declared.startswith('text/'):
		        return None
		    try:
		        path = urlsplit(url).path
		    except ValueError:
		        # urlsplit rejects some malformed URLs (e.g. unbalanced brackets).
		        return None
		    extension = pathlib.PurePosixPath(path).suffix.lstrip('.').lower()
		    if not extension:
		        return None
		    return 'image/' + MIME_SUBTYPES.get(extension, extension)


		def _download(url):
		    """Fetch *url*; return ``(content, content_type)`` or None on failure."""
		    try:
		        response = requests.get(url, timeout=REQUEST_TIMEOUT)
		        # Without this an HTTP error page would be embedded as an "image".
		        response.raise_for_status()
		    except requests.RequestException as exc:
		        log.warning('Could not download %s: %s', url, exc)
		        return None
		    log.debug('r = %s', response)
		    return response.content, response.headers.get('Content-Type')


		def _next_tag(html, start):
		    """Return ``(index, opener, closer)`` of the first img tag at/after *start*."""
		    best = None
		    for opener, closer in TAG_FORMS:
		        at = html.find(opener, start)
		        if at != -1 and (best is None or at < best[0]):
		            best = (at, opener, closer)
		    return best


		def _find_src_value(html, start, end):
		    """Return ``(value_start, value_end)`` of the src value of one tag.

		    The ``src=`` marker must lie inside ``html[start:end]`` (the tag itself) so
		    that a tag without src does not pick up the src of some later element.
		    Returns None when there is no marker or no closing quote.
		    """
		    best = None
		    for opener, closer in SRC_QUOTES:
		        at = html.find(opener, start, end)
		        if at != -1 and (best is None or at < best[0]):
		            best = (at, opener, closer)
		    if best is None:
		        return None
		    at, opener, closer = best
		    value_start = at + len(opener)
		    # Match the same quote style that opened the value.
		    value_end = html.find(closer, value_start)
		    if value_end == -1:
		        return None
		    return value_start, value_end


		def _build_data_uri(url, fetch):
		    """Download *url* via *fetch* and return it as a data URI, or None."""
		    fetched = fetch(url)
		    if fetched is None:
		        return None
		    content, content_type = fetched
		    mime = image_mime_type(url, content_type)
		    if mime is None:
		        log.warning('Skipping %s: cannot tell that it is an image', url)
		        return None
		    encoded = base64.b64encode(content).decode('ascii')
		    log.debug('len(d) = %s', len(encoded))
		    return 'data:' + mime + ';base64,' + encoded


		def embed_images(html, fetch=None, max_url_length=MAX_URL_LENGTH):
		    """Return *html* with every http(s) img src replaced by a data URI.

		    html: the whole document as one string.
		    fetch: callable taking a URL and returning ``(bytes, content_type_or_None)``
		        or None when the download failed; defaults to a requests-based
		        downloader.
		    max_url_length: src values longer than this are left untouched.

		    Tags that have no src, a non-http src, an over-long src or a failed
		    download are left exactly as they were; a warning is logged where a
		    download was skipped or failed.
		    """
		    if fetch is None:
		        fetch = _download
		    data_uris = {}  # url -> data URI (or None), so each URL is fetched once
		    pieces = []     # output fragments, joined once at the end
		    emitted = 0     # html[:emitted] is already represented in pieces
		    cursor = 0      # where the search for the next tag resumes

		    while True:
		        found = _next_tag(html, cursor)
		        if found is None:
		            break
		        tag_start, opener, closer = found
		        body_start = tag_start + len(opener)
		        cursor = body_start
		        tag_end = html.find(closer, body_start)
		        if tag_end == -1:
		            tag_end = len(html)

		        span = _find_src_value(html, body_start, tag_end)
		        if span is None:
		            log.debug('No src value for the tag at offset %s', tag_start)
		            continue
		        value_start, value_end = span
		        url = html[value_start:value_end].strip()
		        log.debug('src = %s', url[:200])
		        if not url.startswith('http'):
		            continue
		        if len(url) > max_url_length:
		            log.warning('Skipping src of %s characters at offset %s (limit %s)',
		                        len(url), value_start, max_url_length)
		            continue

		        if url not in data_uris:
		            data_uris[url] = _build_data_uri(url, fetch)
		        data_uri = data_uris[url]
		        if data_uri is None:
		            continue

		        # Replace only this attribute value, not every occurrence of the URL.
		        pieces.append(html[emitted:value_start])
		        pieces.append(data_uri)
		        emitted = value_end
		        cursor = value_end
		        log.info('Replaced %s', url)

		    pieces.append(html[emitted:])
		    return ''.join(pieces)


		def main():
		    """Command-line entry point: embed the images of one HTML file.

		    Writes the result next to the source as ``<name>.slipstream<suffix>`` and
		    logs to ``<script>.log`` / ``<script>.debug`` beside the script.
		    """
		    log.setLevel(logging.DEBUG)
		    log_file = pathlib.Path(__file__).with_suffix('.log')
		    file_formatter = logging.Formatter('[%(asctime)s] %(process)d (%(levelname)s) %(message)s')
		    file_handler = logging.handlers.RotatingFileHandler(
		        log_file, delay=True, backupCount=1, maxBytes=1000000)
		    file_handler.setLevel(logging.INFO)
		    file_handler.setFormatter(file_formatter)
		    log.addHandler(file_handler)
		    debug_handler = logging.handlers.RotatingFileHandler(
		        log_file.with_suffix('.debug'), delay=True, backupCount=1, maxBytes=1000000)
		    debug_handler.setLevel(logging.DEBUG)
		    debug_handler.setFormatter(file_formatter)
		    log.addHandler(debug_handler)
		    log.info('Script Start')

		    p = argparse.ArgumentParser(
		        description='Parses a HTML file for <img> tags, downloads the linked URLs, and embeds them into the HTML file.'
		    )
		    p.add_argument('source')
		    p.add_argument('--verbose', '-v', action='store_true')
		    p.add_argument('--max-url-length', type=int, default=MAX_URL_LENGTH,
		                   help='src values longer than this are not downloaded (default: %(default)s)')
		    args = p.parse_args()
		    log.info('Arguments: %s', args)

		    console_handler = logging.StreamHandler()
		    console_handler.setLevel(logging.INFO if args.verbose else logging.WARNING)
		    console_handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))
		    log.addHandler(console_handler)

		    source = pathlib.Path(args.source)
		    dest = source.with_suffix('.slipstream' + source.suffix)
		    log.info('Destination: %s', dest)

		    # surrogateescape round-trips bytes that are not valid UTF-8, and
		    # newline='' keeps the original line endings, so only src values change.
		    with open(source, 'r', encoding='utf-8', errors='surrogateescape', newline='') as f:
		        html = f.read()

		    result = embed_images(html, max_url_length=args.max_url_length)

		    # The destination is opened only after processing succeeded, so a failure
		    # part-way through never leaves a truncated output file behind.
		    with open(dest, 'w', encoding='utf-8', errors='surrogateescape', newline='') as g:
		        g.write(result)


		if __name__ == '__main__':
		    main()

requirements.txt

0 → 100644

+2 −0

Original line number	Diff line number	Diff line
		requests~=2.29.0
		No newline at end of file