Commit 4c6206af authored by The Heavy's avatar The Heavy 🚂
Browse files

Initial commit, first draft.

parents
Loading
Loading
Loading
Loading
Loading

.idea/.gitignore

0 → 100644
+3 −0
Original line number Diff line number Diff line
# Default ignored files
/shelf/
/workspace.xml

README.md

0 → 100644
+17 −0
Original line number Diff line number Diff line
imgslipstream

what is it

a simple script to scan an html file and replace every img tag's src with an embedded copy of the base64-encoded data, i.e. embedding the image in the file. (back when i used to play with windows xp installers, adding things to the disk such as updates used to be known as 'slipstreaming'.)

why

because imgur announced recently (apr 2023) that they would be deleting a bunch of images, and i wanted to keep the pictures with some twine games that i've been playing recently, and i also didn't want to do that by hand (i am also something of a data hoarder ;)

how

the python script doesn't pretend to understand html, it reads the file into memory, then looks for the <img opening tag, looks for a src= attribute following that, tries to determine if that contains a url, and if it does, downloads the url, base64 encodes the result, and jams it in to the src attribute. it also looks for &lt;img and src=&quot; which crop up in twine files a fair bit. it then writes the results to a new file.

caveats

badly formatted html won't stop it, but it may react unpredictably. it tries to strip unnecessary whitespace from the url, but that doesn't always work right. if a quote is missing, it may miss the url entirely, or it may grab half the file and think that is the url; i capped the url length at 40 characters to try and avoid sending garbage requests. i have no idea what it does with urls that fail to load, or don't contain an image. i do not recommend removing the original file until you have thoroughly checked everything.

imgslipstream.py

0 → 100755
+139 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3

import requests
import argparse
import pathlib
import logging
import logging.handlers
import base64

# Root logger, shared by the helpers below; handlers are attached in main().
log = logging.getLogger()

# Longest candidate URL that will be requested.  Anything longer is assumed to
# be the product of a missing closing quote rather than a real address.
MAX_URL_LENGTH = 40

# Seconds to wait for a download before giving up on that image.
DOWNLOAD_TIMEOUT = 30

# Tag openers to scan for: plain HTML and the entity-escaped form.
_TAG_OPENERS = ('<img', '&lt;img')

# (attribute opener, matching closing delimiter, fallback closing delimiter)
_SRC_MARKERS = (
    ('src="', '"', '&quot;'),
    ('src=&quot;', '&quot;', '"'),
)

# File extensions whose MIME subtype differs from the extension itself.
_SUBTYPE_FOR_EXTENSION = {'jpg': 'jpeg', 'jpe': 'jpeg', 'svg': 'svg+xml', 'tif': 'tiff'}


def _find_src_value(text, tag_pos):
    """Locate the value of the first ``src`` attribute at or after *tag_pos*.

    Both the plain (``src="``) and entity-escaped (``src=&quot;``) spellings
    are considered and whichever occurs first wins, so a tag is not paired
    with an attribute that belongs to a later tag merely because of its
    quoting style.

    Returns a ``(start, end)`` slice range for the raw value, or ``None`` when
    no ``src`` attribute or no closing delimiter can be found.
    """
    nearest = None
    for opening, closing, fallback in _SRC_MARKERS:
        at = text.find(opening, tag_pos)
        if at != -1 and (nearest is None or at < nearest[0]):
            nearest = (at, opening, closing, fallback)
    if nearest is None:
        return None
    at, opening, closing, fallback = nearest
    start = at + len(opening)
    end = text.find(closing, start)
    if end == -1:
        # Tolerate mismatched quoting styles before giving up.
        end = text.find(fallback, start)
    if end == -1:
        return None
    return start, end


def _image_mime_type(url, content_type):
    """Choose the MIME type for a data URI, or ``None`` if it is not an image.

    An ``image/*`` Content-Type is used as-is.  A ``text/*`` Content-Type is
    treated as an error page and rejected.  Otherwise the type is derived from
    the file extension in the URL path (query string and fragment ignored).
    """
    from urllib.parse import urlsplit

    declared = (content_type or '').split(';', 1)[0].strip().lower()
    if declared.startswith('image/'):
        return declared
    if declared.startswith('text/'):
        return None
    extension = pathlib.PurePosixPath(urlsplit(url).path).suffix.lstrip('.').lower()
    if not extension.isalnum():
        return None
    return 'image/' + _SUBTYPE_FOR_EXTENSION.get(extension, extension)


def _download(url):
    """Fetch *url*; return ``(content, content_type)`` or ``None`` on failure."""
    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        log.warning('Could not download %s: %s', url, exc)
        return None
    log.debug('r = %s', response)
    return response.content, response.headers.get('Content-Type')


def _data_uri_for(url, fetch, cache):
    """Return a ``data:`` URI for *url*, or ``None`` if it should be left alone.

    Results (including failures) are remembered in *cache* so a URL that
    appears several times is only requested once.
    """
    if not url.startswith('http'):
        return None
    if len(url) > MAX_URL_LENGTH:
        log.warning('Skipping src longer than %d characters: %.60s...', MAX_URL_LENGTH, url)
        return None
    if url not in cache:
        data_uri = None
        fetched = fetch(url)
        if fetched is not None:
            content, content_type = fetched
            mime = _image_mime_type(url, content_type)
            if mime is None:
                log.warning('Skipping %s: response does not look like an image', url)
            else:
                encoded = base64.b64encode(content).decode('ascii')
                log.debug('len(d) = %s', len(encoded))
                data_uri = 'data:' + mime + ';base64,' + encoded
        cache[url] = data_uri
    return cache[url]


def _embed_pass(text, opener, fetch, cache):
    """Run one scan of *text* for tags starting with *opener*.

    Only the located attribute value is spliced out; other occurrences of the
    same URL elsewhere in the document are left untouched.
    """
    pieces = []
    copied = 0  # everything before this offset is already in ``pieces``
    tag = text.find(opener)
    while tag != -1:
        span = _find_src_value(text, tag)
        # ``span[0] < copied`` means an earlier tag already claimed this value.
        if span is not None and span[0] >= copied:
            start, end = span
            url = text[start:end].strip()
            log.debug('src = %s', url)
            data_uri = _data_uri_for(url, fetch, cache)
            if data_uri is not None:
                pieces.append(text[copied:start])
                pieces.append(data_uri)
                copied = end
                log.info('Replaced %s', url)
        tag = text.find(opener, tag + 1)
    pieces.append(text[copied:])
    return ''.join(pieces)


def slipstream_images(text, fetch=None):
    """Return *text* with remote ``<img>`` sources replaced by data URIs.

    The text is scanned as a plain string (no HTML parsing) for ``<img`` and
    then ``&lt;img`` tags.  A ``src`` value is replaced only when it starts
    with ``http``, is at most ``MAX_URL_LENGTH`` characters long, downloads
    successfully and looks like an image; otherwise it is left as it was.

    Args:
        text: Contents of the HTML file.
        fetch: Optional callable taking a URL and returning
            ``(content_bytes, content_type_or_None)``, or ``None`` on failure.
            Defaults to an HTTP download via ``requests``.
    """
    if fetch is None:
        fetch = _download
    cache = {}
    for opener in _TAG_OPENERS:
        log.debug('Processing %s', opener)
        text = _embed_pass(text, opener, fetch, cache)
    return text


def main():
    """Configure logging, parse the command line and write the converted copy."""
    log.setLevel(logging.DEBUG)
    log_file = pathlib.Path(__file__).with_suffix('.log')
    file_formatter = logging.Formatter('[%(asctime)s] %(process)d (%(levelname)s) %(message)s')
    file_handler = logging.handlers.RotatingFileHandler(log_file, delay=True, backupCount=1, maxBytes=1000000)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(file_formatter)
    log.addHandler(file_handler)
    debug_handler = logging.handlers.RotatingFileHandler(log_file.with_suffix('.debug'),
                                                         delay=True, backupCount=1, maxBytes=1000000)
    debug_handler.setLevel(logging.DEBUG)
    debug_handler.setFormatter(file_formatter)
    log.addHandler(debug_handler)
    log.info('Script Start')

    p = argparse.ArgumentParser(
        description='Parses a HTML file for <img> tags, downloads the linked URLs, and embeds them into the HTML file.'
    )
    p.add_argument('source')
    p.add_argument('--verbose', '-v', action='store_true')
    args = p.parse_args()
    log.info('Arguments: %s', args)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO if args.verbose else logging.WARNING)
    console_handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))
    log.addHandler(console_handler)

    source = pathlib.Path(args.source)
    dest = source.with_suffix('.slipstream' + source.suffix)
    log.info('Destination: %s', dest)

    # Explicit UTF-8 and newline='' keep the result independent of the
    # platform's locale and preserve the file's original line endings.
    with open(source, 'r', encoding='utf-8', newline='') as f:
        text = f.read()
    result = slipstream_images(text)
    # The destination is only opened once processing has finished, so a
    # failure part-way through does not leave an empty output file behind.
    with open(dest, 'w', encoding='utf-8', newline='') as g:
        g.write(result)


if __name__ == '__main__':
    main()

requirements.txt

0 → 100644
+2 −0
Original line number Diff line number Diff line
requests~=2.29.0
 No newline at end of file