Commit 9259c01e authored by The Heavy 🚂
Browse files

Rewrite it with better logging, alerts, more options, etc.

parent 4c6206af
Loading
Loading
Loading
Loading
+112 −93
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ import pathlib
import logging
import logging.handlers
import base64
import re

if __name__ == '__main__':
    log = logging.getLogger()
@@ -27,7 +28,9 @@ if __name__ == '__main__':
        description='Parses a HTML file for <img> tags, downloads the linked URLs, and embeds them into the HTML file.'
    )
    p.add_argument('source')
    p.add_argument('--verbose', '-v', action='store_true')
    p.add_argument('--verbose', '-v', action='store_true', help='Print more logging to the console.')
    p.add_argument('--files', '-f', action='store_true', help='Make local files instead of embedding.')
    p.add_argument('--maxlen', '-m', action='store', help='Change the max length of URLs allowed (default 100).', default=100, type=int)
    args = p.parse_args()
    log.info('Arguments: %s', args)

@@ -41,99 +44,115 @@ if __name__ == '__main__':

    dest = pathlib.Path(args.source).with_suffix('.slipstream' + str(pathlib.Path(args.source).suffix))
    log.info('Destination: %s', dest)
    dest_folder = dest.with_suffix('').with_stem(dest.stem + '_files')
    if args.files:
        if dest_folder.exists():
            if not dest_folder.is_dir():
                log.error('Destination folder conflicts with existing file: {0}'.format(str(dest_folder)))
                raise FileExistsError
        else:
            dest_folder.mkdir()

    with open(args.source, 'r') as f:
        with open(dest, 'w') as g:
            line = f.read()
            if '<img' in line:
                log.debug('At least one <img found in this file')
                o = '<img'
                s = 'src="'
                s1 = 'src=&quot;'
                e = '"'
                e1 = '&quot;'
                alt = False
                log.debug('Processing <img')
                x = line.find(o, 0)
                log.debug('x = %s', x)
                while x > -1:
                    y = line.find(s, x) + len(s)
                    if y == -1 + len(s):
                        log.debug('s not found, looking for s1')
                        y = line.find(s1, x) + len(s1)
                        alt = True
                    if y < x:
                        raise ValueError
                    log.debug('y = %s', y)
                    if alt:
                        z = line.find(e1, y)
                        if z == -1:
                            z = line.find(e, y)
                        alt = False
        with dest.open(mode='w') as g:
            document = f.read()
            img_tag = 0
            log.debug('img_tag = {0}'.format(img_tag))
            img_end_tag = 0
            log.debug('img_end_tag = {0}'.format(img_end_tag))
            while img_tag > -1:
                if img_end_tag < img_tag + 1:
                    img_end_tag = img_tag + 1
                    log.debug('img_end_tag = {0}'.format(img_end_tag))
                img_tag = document.find('img', img_end_tag + 1)
                log.debug('img_tag = {0}'.format(img_tag))
                if img_tag < 0:
                    log.debug('No more img tags found in file.')
                    continue
                if document[img_tag - 1:img_tag] == '<':
                    img_end_tag = document.find('>', img_tag)
                    log.debug('img_end_tag = {0}'.format(img_end_tag))
                elif document[img_tag - 4:img_tag] == '&lt;':
                    img_end_tag = document.find('&gt;', img_tag)
                    log.debug('img_end_tag = {0}'.format(img_end_tag))
                elif document[img_tag:img_tag + 5] == 'img {' or document[img_tag - 1:img_tag + 4] == ',img,':
                    log.debug('Quietly skipping a CSS tag.')
                    continue
                elif document[img_tag - 2:img_tag + 9] == 'i.imgur.com' or document[img_tag - 6:img_tag + 6] == 'i.postimg.cc':
                    log.debug('Quietly skipping a domain with img in it (not an img tag).')
                    continue
                else:
                    log.warning('img keyword found without &lt; or <, skipped: {0}'.format(document[img_tag - 4:img_tag + 40]))
                    continue

                src_attr = document.find('src', img_tag, img_end_tag)
                log.debug('src_attr = {0}'.format(src_attr))
                if src_attr < 0:
                    log.warning('img tag found without src: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue
                src_quote = document.find('"', src_attr, img_end_tag)
                log.debug('using main (") src_quote = {0}'.format(src_quote))
                if src_quote < 0:
                    src_quote = document.find('&quot;', src_attr, img_end_tag)
                    log.debug('using alternate (&quot;) src_quote = {0}'.format(src_quote))
                    if src_quote < 0:
                        src_quote = document.find('=', src_attr, img_end_tag)
                        log.debug('using alternate (=) src_quote = {0}'.format(src_quote))
                        if src_quote < 0:
                            src_quote = src_attr + 3
                            log.debug('using last resort (src + 3) src_quote = {0}'.format(src_quote))
                        src_end_quote = document.find(' ', src_quote + 1)
                        src_quote_type = ''
                        log.debug('using alternate ( ) src_end_quote = {0}, src_quote_type = {1}'.format(src_end_quote, src_quote_type))
                    else:
                        src_end_quote = document.find('&quot;', src_quote + 6, img_end_tag)
                        src_quote_type = '&quot;'
                        log.debug('using alternate (&quot;) src_end_quote = {0}, src_quote_type = {1}'.format(src_end_quote, src_quote_type))
                else:
                        z = line.find(e, y)
                        if z == -1:
                            z = line.find(e1, y)
                    log.debug('z = %s', z)
                    src = line[y:z].strip()
                    log.debug('src = %s', src)
                    if src[:4] == 'http':
                        if len(src) > 40:
                            raise ValueError
                        r = requests.get(src)
                        log.debug('r = %s', r)
                        d = base64.b64encode(r.content).decode()
                        log.debug('len(d) = %s', len(d))
                        t = src[src.rfind('.'):]
                        log.debug('t = %s', t)
                        line = line.replace(src, 'data:image/' + t + ';base64,' + d)
                        log.info('Replaced %s', src)
                    x = line.find(o, x + 1)
                    log.debug('x = %s', x)
            if '&lt;img' in line:
                log.debug('At least one &lt;img found in this file')
                o = '&lt;img'
                s = 'src=&quot;'
                s1 = 'src="'
                e = '&quot;'
                e1 = '"'
                alt = False
                log.debug('Processing &lt;img')
                x = line.find(o, 0)
                log.debug('x = %s', x)
                while x > -1:
                    y = line.find(s, x) + len(s)
                    log.debug('y = %s', y)
                    if y == -1 + len(s):
                        log.debug('s not found, looking for s1')
                        y = line.find(s1, x) + len(s1)
                        alt = True
                    if y < x:
                        raise ValueError
                    log.debug('y = %s', y)
                    if alt:
                        z = line.find(e1, y)
                        if z == -1:
                            z = line.find(e, y)
                        alt = False
                    src_end_quote = document.find('"', src_quote + 1, img_end_tag)
                    src_quote_type = '"'
                    log.debug('using main (") src_end_quote = {0}, src_quote_type = {1}'.format(src_end_quote, src_quote_type))
                url = document[src_quote + 1:src_end_quote].strip()
                log.debug('url = {0}'.format(url))
                if len(url) < 5:
                    log.warning('src attribute not long enough: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue
                if len(url) > args.maxlen:
                    log.warning('src attribute too long (use -m if required): {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue
                if url[:4] != 'http':
                    if url[:len(dest_folder.name)] == dest_folder.name or url[:11] == 'data:image/':
                        log.debug('Quietly ignore already replaced tags.')
                    else:
                        log.warning('src attribute not http: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue
                if '.' not in url:
                    log.warning('Invalid URL, no dots (.) found: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue

                file_type = url[url.rfind('.') + 1:]
                log.debug('file_type = {0}'.format(file_type))
                if file_type not in ('png', 'gif', 'jpg', 'jpeg'):
                    log.warning('Unrecognised file type: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue

                req = requests.get(url)
                log.debug('req = {0} {1}'.format(req.status_code, req.reason))
                if req.status_code != 200:
                    log.warning('{1} {2}: {0}'.format(document[img_tag - 4:img_end_tag + 40], req.status_code, req.reason))
                    continue
                orig = document[src_attr:src_end_quote + len(src_quote_type)]
                count = document.count(orig)
                log.debug('count = {0}'.format(count))
                if args.files:
                    dest_file = dest_folder / re.sub('[^A-Za-z0-9&_.]', '_', url)
                    log.debug('dest_file = {0}'.format(str(dest_file)))
                    log.debug('wrote {0} bytes'.format(dest_file.write_bytes(req.content)))
                    replacement = 'src=' + src_quote_type + dest_folder.name + '/' + dest_file.name + src_quote_type
                    log.debug('replacement = {0}'.format(replacement))
                else:
                        z = line.find(e, y)
                        if z == -1:
                            z = line.find(e1, y)
                    log.debug('z = %s', z)
                    src = line[y:z].strip()
                    log.debug('src = %s', src)
                    if src[:4] == 'http':
                        if len(src) > 40:
                            raise ValueError
                        r = requests.get(src)
                        log.debug('r = %s', r)
                        d = base64.b64encode(r.content).decode()
                        log.debug('len(d) = %s', len(d))
                        t = src[src.rfind('.'):]
                        log.debug('t = %s', t)
                        line = line.replace(src, 'data:image/' + t + ';base64,' + d)
                        log.info('Replaced %s', src)
                    x = line.find(o, x + 1)
                    log.debug('x = %s', x)
            g.write(line)
                    replacement = 'src=' + src_quote_type + 'data:image/' + file_type + ';base64,' + base64.b64encode(req.content).decode() + src_quote_type
                    log.debug('replacement[:40]...[-10:] = {0}...{1}'.format(replacement[:40], replacement[-10:]))
                document = document.replace(orig, replacement)
                log.info('Replaced {0} instances of: {1}'.format(count, orig))
            g.write(document)