#!/usr/bin/env python3
"""imgslipstream: parse an HTML file for <img> tags, download the linked
image URLs, and either embed them as base64 ``data:`` URIs (default) or
save them to a local ``*_files`` folder (``--files``), writing the result
to ``<source>.slipstream.<ext>``.

NOTE(review): this file was reconstructed from a whitespace-mangled diff
rendering.  The diff renderer decoded HTML entities inside string
literals; '&lt;' and '&quot;' below are recovered from the slice
arithmetic (4- and 6-character offsets) and '&gt;' is inferred -- confirm
all three against the real file.  Collapsed diff hunks (logger handler
setup, the argparse constructor header, --verbose handling) are filled
with minimal standard glue, each marked NOTE(review).
"""

import argparse
import base64
import logging
import logging.handlers
import pathlib
import re

import requests

if __name__ == '__main__':
    log = logging.getLogger()
    # NOTE(review): the real handler/formatter setup was in a collapsed
    # diff hunk (logging.handlers is imported, so a file handler is
    # likely) -- a plain console config is assumed here; confirm.
    logging.basicConfig(level=logging.INFO)

    p = argparse.ArgumentParser(
        description='Parses a HTML file for <img> tags, downloads the linked URLs, and embeds them into the HTML file.'
    )
    p.add_argument('source')
    p.add_argument('--verbose', '-v', action='store_true', help='Print more logging to the console.')
    p.add_argument('--files', '-f', action='store_true', help='Make local files instead of embedding.')
    p.add_argument('--maxlen', '-m', action='store', help='Change the max length of URLs allowed (default 100).', default=100, type=int)
    args = p.parse_args()
    log.info('Arguments: %s', args)
    if args.verbose:
        # NOTE(review): the original --verbose handling was in a collapsed
        # hunk; raising the root logger to DEBUG is assumed -- confirm.
        log.setLevel(logging.DEBUG)

    # Output lands next to the source: page.html -> page.slipstream.html
    dest = pathlib.Path(args.source).with_suffix('.slipstream' + str(pathlib.Path(args.source).suffix))
    log.info('Destination: %s', dest)
    # Folder used by --files mode: page.slipstream.html -> page.slipstream_files/
    dest_folder = dest.with_suffix('').with_stem(dest.stem + '_files')
    if args.files:
        if dest_folder.exists():
            if not dest_folder.is_dir():
                log.error('Destination folder conflicts with existing file: {0}'.format(str(dest_folder)))
                # FIX: raise with the offending path instead of the bare class.
                raise FileExistsError(str(dest_folder))
        else:
            dest_folder.mkdir()

    with open(args.source, 'r') as f:
        with dest.open(mode='w') as g:
            document = f.read()
            img_tag = 0
            log.debug('img_tag = {0}'.format(img_tag))
            img_end_tag = 0
            log.debug('img_end_tag = {0}'.format(img_end_tag))
            while img_tag > -1:
                # Always advance the scan position, so a skipped match can
                # never make the loop spin on the same offset forever.
                if img_end_tag < img_tag + 1:
                    img_end_tag = img_tag + 1
                    log.debug('img_end_tag = {0}'.format(img_end_tag))
                img_tag = document.find('img', img_end_tag + 1)
                log.debug('img_tag = {0}'.format(img_tag))
                if img_tag < 0:
                    log.debug('No more img tags found in file.')
                    continue  # while-condition is now false -> loop exits

                # Classify this 'img' keyword: a real tag, an HTML-escaped
                # tag, a CSS selector, or a domain name containing 'img'.
                if document[img_tag - 1:img_tag] == '<':
                    img_end_tag = document.find('>', img_tag)
                    log.debug('img_end_tag = {0}'.format(img_end_tag))
                elif document[img_tag - 4:img_tag] == '&lt;':
                    # Escaped <img inside displayed-as-text HTML; the 4-char
                    # slice width pins this literal as '&lt;'.
                    # NOTE(review): '&gt;' as terminator is inferred -- confirm.
                    img_end_tag = document.find('&gt;', img_tag)
                    log.debug('img_end_tag = {0}'.format(img_end_tag))
                elif document[img_tag:img_tag + 5] == 'img {' or document[img_tag - 1:img_tag + 4] == ',img,':
                    log.debug('Quietly skipping a CSS tag.')
                    continue
                elif document[img_tag - 2:img_tag + 9] == 'i.imgur.com' or document[img_tag - 6:img_tag + 6] == 'i.postimg.cc':
                    log.debug('Quietly skipping a domain with img in it (not an img tag).')
                    continue
                else:
                    log.warning('img keyword found without < or &lt;, skipped: {0}'.format(document[img_tag - 4:img_tag + 40]))
                    continue

                src_attr = document.find('src', img_tag, img_end_tag)
                log.debug('src_attr = {0}'.format(src_attr))
                if src_attr < 0:
                    log.warning('img tag found without src: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue

                # Locate the opening delimiter of the src value, trying
                # progressively weaker forms: " then &quot; then = then a
                # bare offset.  src_quote_type remembers the closing token.
                src_quote = document.find('"', src_attr, img_end_tag)
                log.debug('using main (") src_quote = {0}'.format(src_quote))
                if src_quote < 0:
                    src_quote = document.find('&quot;', src_attr, img_end_tag)
                    log.debug('using alternate (&quot;) src_quote = {0}'.format(src_quote))
                    if src_quote < 0:
                        src_quote = document.find('=', src_attr, img_end_tag)
                        log.debug('using alternate (=) src_quote = {0}'.format(src_quote))
                        if src_quote < 0:
                            src_quote = src_attr + 3
                            log.debug('using last resort (src + 3) src_quote = {0}'.format(src_quote))
                        # Unquoted value: ends at the next space.
                        src_end_quote = document.find(' ', src_quote + 1)
                        src_quote_type = ''
                        log.debug('using alternate ( ) src_end_quote = {0}, src_quote_type = {1}'.format(src_end_quote, src_quote_type))
                    else:
                        # '&quot;' is 6 chars, hence the +6 skip past it.
                        src_end_quote = document.find('&quot;', src_quote + 6, img_end_tag)
                        src_quote_type = '&quot;'
                        log.debug('using alternate (&quot;) src_end_quote = {0}, src_quote_type = {1}'.format(src_end_quote, src_quote_type))
                else:
                    src_end_quote = document.find('"', src_quote + 1, img_end_tag)
                    src_quote_type = '"'
                    log.debug('using main (") src_end_quote = {0}, src_quote_type = {1}'.format(src_end_quote, src_quote_type))

                # FIX: skip the whole opening delimiter.  The original
                # skipped a single character, which left 'quot;' glued to
                # the URL whenever the delimiter was the 6-char '&quot;'.
                url = document[src_quote + max(1, len(src_quote_type)):src_end_quote].strip()
                log.debug('url = {0}'.format(url))
                if len(url) < 5:
                    log.warning('src attribute not long enough: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue
                if len(url) > args.maxlen:
                    log.warning('src attribute too long (use -m if required): {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue
                if url[:4] != 'http':
                    if url[:len(dest_folder.name)] == dest_folder.name or url[:11] == 'data:image/':
                        log.debug('Quietly ignore already replaced tags.')
                    else:
                        log.warning('src attribute not http: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue
                if '.' not in url:
                    log.warning('Invalid URL, no dots (.) found: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue
                file_type = url[url.rfind('.') + 1:]
                log.debug('file_type = {0}'.format(file_type))
                if file_type not in ('png', 'gif', 'jpg', 'jpeg'):
                    log.warning('Unrecognised file type: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                    continue

                # NOTE(review): no timeout= here, matching the original; a
                # dead host will hang the script -- consider requests.get(url,
                # timeout=...) if that matters.
                req = requests.get(url)
                log.debug('req = {0} {1}'.format(req.status_code, req.reason))
                if req.status_code != 200:
                    log.warning('{1} {2}: {0}'.format(document[img_tag - 4:img_end_tag + 40], req.status_code, req.reason))
                    continue

                # Replace every occurrence of the exact src="..." span at once.
                orig = document[src_attr:src_end_quote + len(src_quote_type)]
                count = document.count(orig)
                log.debug('count = {0}'.format(count))
                if args.files:
                    # Sanitise the URL into a flat filename inside dest_folder.
                    dest_file = dest_folder / re.sub('[^A-Za-z0-9&_.]', '_', url)
                    log.debug('dest_file = {0}'.format(str(dest_file)))
                    log.debug('wrote {0} bytes'.format(dest_file.write_bytes(req.content)))
                    replacement = 'src=' + src_quote_type + dest_folder.name + '/' + dest_file.name + src_quote_type
                    log.debug('replacement = {0}'.format(replacement))
                else:
                    replacement = 'src=' + src_quote_type + 'data:image/' + file_type + ';base64,' + base64.b64encode(req.content).decode() + src_quote_type
                    log.debug('replacement[:40]...[-10:] = {0}...{1}'.format(replacement[:40], replacement[-10:]))
                document = document.replace(orig, replacement)
                log.info('Replaced {0} instances of: {1}'.format(count, orig))
            g.write(document)