Commit bf930632 authored by The Heavy 🚂
Browse files

Added ability to specify a base URL to handle relative links.

parent a5cb6001
Loading
Loading
Loading
Loading
+25 −2
Original line number Diff line number Diff line
@@ -32,6 +32,7 @@ if __name__ == '__main__':
    p.add_argument('--files', '-f', action='store_true', help='Make local files instead of embedding.')
    p.add_argument('--maxlen', '-m', action='store', help='Change the max length of URLs allowed (default 100).',
                   default=100, type=int)
    # Optional base URL: it must be an absolute http(s) URL, and relative src
    # attributes found in the document are resolved against it.
    p.add_argument('--url', '-u', action='store',
                   help='Provide a base URL (http or https) used to resolve relative links when downloading files.')
    args = p.parse_args()
    log.info('Arguments: %s', args)

@@ -43,6 +44,20 @@ if __name__ == '__main__':
    console_handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))
    log.addHandler(console_handler)

    # Validate and normalise the optional base URL (--url) before it is used
    # to turn relative links into absolute ones.
    if args.url is not None:
        url_scheme, scheme_sep, url_rest = args.url.partition('://')
        # NOTE: argparse.ArgumentError must be instantiated with
        # (argument, message); raising the bare class fails with a TypeError
        # instead of reporting the intended problem.
        if not args.url.startswith('http'):
            log.error('Unsupported protocol for URL argument.')
            raise argparse.ArgumentError(None, 'Unsupported protocol for URL argument.')
        if not scheme_sep:
            log.error('URL argument is not a valid URL.')
            raise argparse.ArgumentError(None, 'URL argument is not a valid URL.')
        # A bare prefix test would also let schemes such as "httpx://" through.
        if url_scheme not in ('http', 'https'):
            log.error('Unsupported protocol for URL argument.')
            raise argparse.ArgumentError(None, 'Unsupported protocol for URL argument.')
        # The host is everything between "://" and the next slash.
        if '.' not in url_rest.split('/', 1)[0]:
            log.error('Invalid domain for URL argument.')
            raise argparse.ArgumentError(None, 'Invalid domain for URL argument.')
        # Guarantee a trailing slash so relative paths can simply be appended.
        if not args.url.endswith('/'):
            log.debug('Adding ending slash to URL argument.')
            args.url = args.url + '/'

    dest = pathlib.Path(args.source).with_suffix('.slipstream' + str(pathlib.Path(args.source).suffix))
    log.info('Destination: %s', dest)
    dest_folder = dest.with_suffix('').with_stem(dest.stem + '_files')
@@ -114,7 +129,8 @@ if __name__ == '__main__':
                            )
                        )
                    else:
                        src_end_quote = document.find('"', src_quote + 6, img_end_tag)
                        src_quote = src_quote + 5
                        src_end_quote = document.find('"', src_quote + 1, img_end_tag)
                        src_quote_type = '"'
                        log.debug(
                            'using alternate (") src_end_quote = {0}, src_quote_type = {1}'.format(
@@ -137,6 +153,13 @@ if __name__ == '__main__':
                # Anything that does not start with "http" is treated as a
                # non-absolute source and handled here.
                if url[:4] != 'http':
                    # Sources that already point into the local files folder or
                    # are embedded data:image/ URIs need no further work.
                    if url[:len(dest_folder.name)] == dest_folder.name or url[:11] == 'data:image/':
                        log.debug('Quietly ignore already replaced tags.')
                        continue
                    elif args.url is not None:
                        # A base URL was supplied: build an absolute URL from it.
                        if url[:1] == '/':
                            # Root-relative path: keep only the scheme and host of
                            # the base URL. Searching from index 8 skips the "://"
                            # of both http:// and https://.
                            # NOTE(review): relies on args.url containing a slash
                            # after the host (e.g. a trailing "/") - confirm it is
                            # normalised before this point.
                            url = args.url[:args.url.find('/', 8)] + url
                        else:
                            # Plain relative path: append it to the base URL.
                            url = args.url + url
                        log.debug('Attemping relative url retrieval with url: {0}'.format(url))
                    else:
                        # No base URL to resolve against: report the tag and skip it.
                        log.warning('src attribute not http: {0}'.format(document[img_tag - 4:img_end_tag + 40]))
                        continue