From bb18097c97873826680e09758eb2bbc20a81584e Mon Sep 17 00:00:00 2001 From: mister-monster <38917788+mister-monster@users.noreply.github.com> Date: Fri, 22 Nov 2019 16:33:20 -0600 Subject: [PATCH] closed some issues added support for tags, YT channel language fetching and peertube categories --- youtube2peertube.py | 510 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 498 insertions(+), 12 deletions(-) diff --git a/youtube2peertube.py b/youtube2peertube.py index d6c2e9d..72a0c68 100644 --- a/youtube2peertube.py +++ b/youtube2peertube.py @@ -15,6 +15,8 @@ import utils def get_video_data(channel_id): yt_rss_url = "https://www.youtube.com/feeds/videos.xml?channel_id=" + channel_id feed = fp.parse(yt_rss_url) + channel_lang = feed["feed"]["title_detail"]["language"] + print(feed["feed"]) entries = feed["entries"] channels_timestamps = "channels_timestamps.csv" # clear any existing queue before start @@ -65,6 +67,257 @@ def get_video_data(channel_id): if line != '': ct.write(line + "\n") ct.close() + return queue, channel_lang + +def download_yt_video(queue_item, dl_dir, channel_conf): + url = queue_item["link"] + dl_dir = dl_dir + channel_conf["name"] + try: + video = pafy.new(url) + streams = video.streams + #for s in streams: + #print(s.resolution, s.extension, s.get_filesize, s.url) + best = video.getbest(preftype=channel_conf["preferred_extension"]) + filepath = dl_dir + "/"+ queue_item["yt_videoid"] + "." + channel_conf["preferred_extension"] + #TODO: implement resolution logic from config, currently downloading best resolution + best.download(filepath=filepath, quiet=False) + + except: + pass + # TODO: check YT alternate URL for video availability + # TODO: print and log exceptions + +def save_metadata(queue_item, dl_dir, channel_conf): + dl_dir = dl_dir + channel_conf["name"] + link = queue_item["link"] + title = queue_item["title"] + description = queue_item["summary"] + author = queue_item["author"] + published = queue_item["published"] + metadata_file = dl_dir + "/" + queue_item["yt_videoid"] + ".txt" + metadata = open(metadata_file, "w+") + # save relevant metadata as semicolon separated easy to read values to text file + metadata.write('title: "' + title + '";\n\nlink: "' + link + '";\n\nauthor: "' + author + '";\n\npublished: "' + + published + '";\n\ndescription: "' + description + '"\n\n;') + # save raw metadata JSON string + metadata.write(str(queue_item)) + metadata.close() + +def save_thumbnail(queue_item, dl_dir, channel_conf): + dl_dir = dl_dir + channel_conf["name"] + thumb = str(queue_item["media_thumbnail"][0]["url"]) + extension = thumb.split(".")[-1] + thumb_file = dl_dir + "/" + queue_item["yt_videoid"] + "." + extension + # download the thumbnail + urlretrieve(thumb, thumb_file) + return extension + +def get_pt_auth(channel_conf): + # get variables from channel_conf + pt_api = channel_conf["peertube_instance"] + "/api/v1" + pt_uname = channel_conf["peertube_username"] + pt_passwd = channel_conf["peertube_password"] + # get client ID and secret from peertube instance + id_secret = json.loads(str(requests.get(pt_api + "/oauth-clients/local").content).split("'")[1]) + client_id = id_secret["client_id"] + client_secret = id_secret["client_secret"] + # construct JSON for post request to get access token + auth_json = {'client_id': client_id, + 'client_secret': client_secret, + 'grant_type': 'password', + 'response_type': 'code', + 'username': pt_uname, + 'password': pt_passwd + } + # get access token + auth_result = json.loads(str(requests.post(pt_api + "/users/token", data=auth_json).content).split("'")[1]) + access_token = auth_result["access_token"] + return access_token + +def get_pt_channel_id(channel_conf): + pt_api = channel_conf["peertube_instance"] + "/api/v1" + post_url = pt_api + "/video-channels/" + channel_conf["peertube_channel"] + "/" + returned_json = json.loads(requests.get(post_url).content) + channel_id = returned_json["id"] + return channel_id + +def get_file(file_path): + mimetypes.init() + return (path.basename(file_path), open(path.abspath(file_path), 'rb'), + mimetypes.types_map[path.splitext(file_path)[1]]) + +def upload_to_pt(dl_dir, channel_conf, queue_item, access_token, thumb_extension): + # Adapted from Prismedia https://git.lecygnenoir.info/LecygneNoir/prismedia + pt_api = channel_conf["peertube_instance"] + "/api/v1" + video_file = dl_dir + channel_conf["name"] + "/" + queue_item["yt_videoid"] + "." + \ + channel_conf["preferred_extension"] + thumb_file = dl_dir + channel_conf["name"] + "/" + queue_item["yt_videoid"] + "." + thumb_extension + description = channel_conf["description_prefix"] + "\n\n" + queue_item["summary"] + "\n\n" + channel_conf["description_suffix"] + channel_id = str(get_pt_channel_id(channel_conf)) + # We need to transform fields into tuple to deal with tags as + # MultipartEncoder does not support list refer + # https://github.com/requests/toolbelt/issues/190 and + # https://github.com/requests/toolbelt/issues/205 + fields = [ + ("name", queue_item["title"]), + ("licence", "1"), + ("description", description), + ("nsfw", channel_conf["nsfw"]), + ("channelId", channel_id), + ("originallyPublishedAt", queue_item["published"]), + ("category", channel_conf["pt_channel_category"]), + ("language", channel_conf["default_lang"]), + ("privacy", str(channel_conf["pt_privacy"])), + ("commentsEnabled", channel_conf["comments_enabled"]), + ("videofile", get_file(video_file)), + ("thumbnailfile", get_file(thumb_file)), + ("previewfile", get_file(thumb_file)), + ("waitTranscoding", 'false') + ] + + if channel_conf["pt_tags"] != "": + fields.append(("tags", "[" + channel_conf["pt_tags"] + "]")) + else: + print("you have no tags in your configuration file for this channel") + multipart_data = MultipartEncoder(fields) + headers = { + 'Content-Type': multipart_data.content_type, + 'Authorization': "Bearer " + access_token + } + print(requests.post(pt_api + "/videos/upload", data=multipart_data, headers=headers).content) + +def pt_http_import(dl_dir, channel_conf, queue_item, access_token, thumb_extension, yt_lang): + # Adapted from Prismedia https://git.lecygnenoir.info/LecygneNoir/prismedia + pt_api = channel_conf["peertube_instance"] + "/api/v1" + yt_video_url = queue_item["link"] + # TODO: use the alternate link if video not found error occurs + alternate_link = queue_item["links"][0]["href"] + thumb_file = dl_dir + channel_conf["name"] + "/" + queue_item["yt_videoid"] + "." + thumb_extension + description = channel_conf["description_prefix"] + "\n\n" + queue_item["summary"] + "\n\n" + channel_conf["description_suffix"] + channel_id = str(get_pt_channel_id(channel_conf)) + language = utils.set_pt_lang(yt_lang, channel_conf["default_lang"]) + category = utils.set_pt_category(channel_conf["pt_channel_category"]) + # We need to transform fields into tuple to deal with tags as + # MultipartEncoder does not support list refer + # https://github.com/requests/toolbelt/issues/190 and + # https://github.com/requests/toolbelt/issues/205 + fields = [ + ("name", queue_item["title"]), + ("licence", "1"), + ("description", description), + ("nsfw", channel_conf["nsfw"]), + ("channelId", channel_id), + ("originallyPublishedAt", queue_item["published"]), + ("category", category), + ("language", language), + ("privacy", str(channel_conf["pt_privacy"])), + ("commentsEnabled", channel_conf["comments_enabled"]), + ("targetUrl", yt_video_url), + ("thumbnailfile", get_file(thumb_file)), + ("previewfile", get_file(thumb_file)), + ("waitTranscoding", 'false') + ] + if channel_conf["pt_tags"] != "": + fields.append(("tags[]", channel_conf["pt_tags"])) + else: + print("you have no tags in your configuration file for this channel") + multipart_data = MultipartEncoder(fields) + headers = { + 'Content-Type': multipart_data.content_type, + 'Authorization': "Bearer " + access_token + } + print(requests.post(pt_api + "/videos/imports", data=multipart_data, headers=headers).content) + +def run_steps(conf): + # TODO: logging + channel = conf["channel"] + # run loop for every channel in the configuration file + global_conf = conf["global"] + if conf["global"]["delete_videos"] == "true": + delete_videos = True + else: + delete_videos = False + # The following enables the deletion of thumbnails, videos are not downloaded at all + if conf["global"]["use_pt_http_import"] == "true": + delete_videos = True + use_pt_http_import = True + else: + use_pt_http_import = False + dl_dir = global_conf["video_download_dir"] + if not path.exists(dl_dir): + mkdir(dl_dir) + channel_counter = 0 + for c in channel: + print("\n") + channel_id = channel[c]["channel_id"] + channel_conf = channel[str(channel_counter)] + video_data = get_video_data(channel_id) + queue = video_data[0] + yt_lang = video_data[1] + if len(queue) > 0: + if not path.exists(dl_dir + "/" + channel_conf["name"]): + mkdir(dl_dir + "/" + channel_conf["name"]) + # download videos, metadata and thumbnails from youtube + for queue_item in queue: + if not use_pt_http_import: + print("downloading " + queue_item["yt_videoid"] + " from YouTube...") + download_yt_video(queue_item, dl_dir, channel_conf) + print("done.") + # TODO: download closest to config specified resolution instead of best resolution + thumb_extension = save_thumbnail(queue_item, dl_dir, channel_conf) + # only save metadata to text file if archiving videos + if not delete_videos: + print("saving video metadata...") + save_metadata(queue_item, dl_dir, channel_conf) + print("done.") + access_token = get_pt_auth(channel_conf) + # upload videos, metadata and thumbnails to peertube + for queue_item in queue: + if not use_pt_http_import: + print("uploading " + queue_item["yt_videoid"] + " to Peertube...") + upload_to_pt(dl_dir, channel_conf, queue_item, access_token, thumb_extension) + print("done.") + else: + print("mirroring " + queue_item["link"] + " to Peertube using HTTP import...") + pt_http_import(dl_dir, channel_conf, queue_item, access_token, thumb_extension, yt_lang) + print("done.") + if delete_videos: + print("deleting videos and/or thumbnails...") + rmtree(dl_dir + "/" + channel_conf["name"], ignore_errors=True) + print("done") + channel_counter += 1 + +def run(run_once=True): + #TODO: turn this into a daemon + conf = utils.read_conf("config.toml") + if run_once: + run_steps(conf) + else: + while True: + poll_frequency = int(conf["global"]["poll_frequency"]) * 60 + run_steps(conf) + sleep(poll_frequency) + +if __name__ == "__main__": + run(run_once=False) + + published_int = utils.convert_timestamp(published) + ctr_line_list = ctr_line.split(",") + line_published_int = utils.convert_timestamp(ctr_line_list[1]) + if published_int > line_published_int: + # update the timestamp in the line for the channel in channels_timestamps, + ctr.remove(ctr_line) + ctr_line = str(channel_id + "," + published + "," + updated + '\n') + ctr.append(ctr_line) + # and add current videos to queue. + queue.append(i) + print(published) + # write the new channels and timestamps line to channels_timestamps.csv + ct = open(channels_timestamps, "w") + for line in ctr: + if line != '': + ct.write(line + "\n") + ct.close() return queue def download_yt_video(e, dl_dir, channel_conf): @@ -224,6 +477,237 @@ def pt_http_import(dl_dir, channel_conf, e, access_token, thumb_extension): print(requests.post(pt_api + "/videos/imports", data=multipart_data, headers=headers).content) def run_steps(conf): +#!/usr/bin/python3 + +import pafy +import feedparser as fp +from urllib.request import urlretrieve +import requests +import json +from time import sleep +from os import mkdir, path +from shutil import rmtree +import mimetypes +from requests_toolbelt.multipart.encoder import MultipartEncoder +import utils + +def get_video_data(channel_id): + yt_rss_url = "https://www.youtube.com/feeds/videos.xml?channel_id=" + channel_id + feed = fp.parse(yt_rss_url) + channel_lang = feed["feed"]["title_detail"]["language"] + print(feed["feed"]) + entries = feed["entries"] + channels_timestamps = "channels_timestamps.csv" + # clear any existing queue before start + queue = [] + # read contents of channels_timestamps.csv, create list object of contents + ct = open(channels_timestamps, "r") + ctr = ct.read().split("\n") + ct.close() + ctr_line = [] + channel_found = False + # check if channel ID is found in channels_timestamps.csv + for line in ctr: + line_list = line.split(',') + if channel_id == line_list[0]: + channel_found = True + ctr_line = line + break + if not channel_found: + print("new channel added to config: " + channel_id) + print(channel_id) + # iterate through video entries for channel, parse data into objects for use + for pos, i in enumerate(reversed(entries)): + published = i["published"] + updated = i["updated"] + if not channel_found: + # add the video to the queue + queue.append(i) + ctr_line = str(channel_id + "," + published + "," + updated + '\n') + # add the new line to ctr for adding to channels_timestamps later + ctr.append(ctr_line) + channel_found = True + # if the channel exists in channels_timestamps, update "published" time in the channel line + else: + published_int = utils.convert_timestamp(published) + ctr_line_list = ctr_line.split(",") + line_published_int = utils.convert_timestamp(ctr_line_list[1]) + if published_int > line_published_int: + # update the timestamp in the line for the channel in channels_timestamps, + ctr.remove(ctr_line) + ctr_line = str(channel_id + "," + published + "," + updated + '\n') + ctr.append(ctr_line) + # and add current videos to queue. + queue.append(i) + print(published) + # write the new channels and timestamps line to channels_timestamps.csv + ct = open(channels_timestamps, "w") + for line in ctr: + if line != '': + ct.write(line + "\n") + ct.close() + return queue, channel_lang + +def download_yt_video(queue_item, dl_dir, channel_conf): + url = queue_item["link"] + dl_dir = dl_dir + channel_conf["name"] + try: + video = pafy.new(url) + streams = video.streams + #for s in streams: + #print(s.resolution, s.extension, s.get_filesize, s.url) + best = video.getbest(preftype=channel_conf["preferred_extension"]) + filepath = dl_dir + "/"+ queue_item["yt_videoid"] + "." + channel_conf["preferred_extension"] + #TODO: implement resolution logic from config, currently downloading best resolution + best.download(filepath=filepath, quiet=False) + + except: + pass + # TODO: check YT alternate URL for video availability + # TODO: print and log exceptions + +def save_metadata(queue_item, dl_dir, channel_conf): + dl_dir = dl_dir + channel_conf["name"] + link = queue_item["link"] + title = queue_item["title"] + description = queue_item["summary"] + author = queue_item["author"] + published = queue_item["published"] + metadata_file = dl_dir + "/" + queue_item["yt_videoid"] + ".txt" + metadata = open(metadata_file, "w+") + # save relevant metadata as semicolon separated easy to read values to text file + metadata.write('title: "' + title + '";\n\nlink: "' + link + '";\n\nauthor: "' + author + '";\n\npublished: "' + + published + '";\n\ndescription: "' + description + '"\n\n;') + # save raw metadata JSON string + metadata.write(str(queue_item)) + metadata.close() + +def save_thumbnail(queue_item, dl_dir, channel_conf): + dl_dir = dl_dir + channel_conf["name"] + thumb = str(queue_item["media_thumbnail"][0]["url"]) + extension = thumb.split(".")[-1] + thumb_file = dl_dir + "/" + queue_item["yt_videoid"] + "." + extension + # download the thumbnail + urlretrieve(thumb, thumb_file) + return extension + +def get_pt_auth(channel_conf): + # get variables from channel_conf + pt_api = channel_conf["peertube_instance"] + "/api/v1" + pt_uname = channel_conf["peertube_username"] + pt_passwd = channel_conf["peertube_password"] + # get client ID and secret from peertube instance + id_secret = json.loads(str(requests.get(pt_api + "/oauth-clients/local").content).split("'")[1]) + client_id = id_secret["client_id"] + client_secret = id_secret["client_secret"] + # construct JSON for post request to get access token + auth_json = {'client_id': client_id, + 'client_secret': client_secret, + 'grant_type': 'password', + 'response_type': 'code', + 'username': pt_uname, + 'password': pt_passwd + } + # get access token + auth_result = json.loads(str(requests.post(pt_api + "/users/token", data=auth_json).content).split("'")[1]) + access_token = auth_result["access_token"] + return access_token + +def get_pt_channel_id(channel_conf): + pt_api = channel_conf["peertube_instance"] + "/api/v1" + post_url = pt_api + "/video-channels/" + channel_conf["peertube_channel"] + "/" + returned_json = json.loads(requests.get(post_url).content) + channel_id = returned_json["id"] + return channel_id + +def get_file(file_path): + mimetypes.init() + return (path.basename(file_path), open(path.abspath(file_path), 'rb'), + mimetypes.types_map[path.splitext(file_path)[1]]) + +def upload_to_pt(dl_dir, channel_conf, queue_item, access_token, thumb_extension): + # Adapted from Prismedia https://git.lecygnenoir.info/LecygneNoir/prismedia + pt_api = channel_conf["peertube_instance"] + "/api/v1" + video_file = dl_dir + channel_conf["name"] + "/" + queue_item["yt_videoid"] + "." + \ + channel_conf["preferred_extension"] + thumb_file = dl_dir + channel_conf["name"] + "/" + queue_item["yt_videoid"] + "." + thumb_extension + description = channel_conf["description_prefix"] + "\n\n" + queue_item["summary"] + "\n\n" + channel_conf["description_suffix"] + channel_id = str(get_pt_channel_id(channel_conf)) + # We need to transform fields into tuple to deal with tags as + # MultipartEncoder does not support list refer + # https://github.com/requests/toolbelt/issues/190 and + # https://github.com/requests/toolbelt/issues/205 + fields = [ + ("name", queue_item["title"]), + ("licence", "1"), + ("description", description), + ("nsfw", channel_conf["nsfw"]), + ("channelId", channel_id), + ("originallyPublishedAt", queue_item["published"]), + ("category", channel_conf["pt_channel_category"]), + ("language", channel_conf["default_lang"]), + ("privacy", str(channel_conf["pt_privacy"])), + ("commentsEnabled", channel_conf["comments_enabled"]), + ("videofile", get_file(video_file)), + ("thumbnailfile", get_file(thumb_file)), + ("previewfile", get_file(thumb_file)), + ("waitTranscoding", 'false') + ] + + if channel_conf["pt_tags"] != "": + fields.append(("tags", "[" + channel_conf["pt_tags"] + "]")) + else: + print("you have no tags in your configuration file for this channel") + multipart_data = MultipartEncoder(fields) + headers = { + 'Content-Type': multipart_data.content_type, + 'Authorization': "Bearer " + access_token + } + print(requests.post(pt_api + "/videos/upload", data=multipart_data, headers=headers).content) + +def pt_http_import(dl_dir, channel_conf, queue_item, access_token, thumb_extension, yt_lang): + # Adapted from Prismedia https://git.lecygnenoir.info/LecygneNoir/prismedia + pt_api = channel_conf["peertube_instance"] + "/api/v1" + yt_video_url = queue_item["link"] + # TODO: use the alternate link if video not found error occurs + alternate_link = queue_item["links"][0]["href"] + thumb_file = dl_dir + channel_conf["name"] + "/" + queue_item["yt_videoid"] + "." + thumb_extension + description = channel_conf["description_prefix"] + "\n\n" + queue_item["summary"] + "\n\n" + channel_conf["description_suffix"] + channel_id = str(get_pt_channel_id(channel_conf)) + language = utils.set_pt_lang(yt_lang, channel_conf["default_lang"]) + category = utils.set_pt_category(channel_conf["pt_channel_category"]) + # We need to transform fields into tuple to deal with tags as + # MultipartEncoder does not support list refer + # https://github.com/requests/toolbelt/issues/190 and + # https://github.com/requests/toolbelt/issues/205 + fields = [ + ("name", queue_item["title"]), + ("licence", "1"), + ("description", description), + ("nsfw", channel_conf["nsfw"]), + ("channelId", channel_id), + ("originallyPublishedAt", queue_item["published"]), + ("category", category), + ("language", language), + ("privacy", str(channel_conf["pt_privacy"])), + ("commentsEnabled", channel_conf["comments_enabled"]), + ("targetUrl", yt_video_url), + ("thumbnailfile", get_file(thumb_file)), + ("previewfile", get_file(thumb_file)), + ("waitTranscoding", 'false') + ] + if channel_conf["pt_tags"] != "": + fields.append(("tags[]", channel_conf["pt_tags"])) + else: + print("you have no tags in your configuration file for this channel") + multipart_data = MultipartEncoder(fields) + headers = { + 'Content-Type': multipart_data.content_type, + 'Authorization': "Bearer " + access_token + } + print(requests.post(pt_api + "/videos/imports", data=multipart_data, headers=headers).content) + +def run_steps(conf): # TODO: logging channel = conf["channel"] # run loop for every channel in the configuration file @@ -246,33 +730,35 @@ def run_steps(conf): print("\n") channel_id = channel[c]["channel_id"] channel_conf = channel[str(channel_counter)] - queue = get_video_data(channel_id) + video_data = get_video_data(channel_id) + queue = video_data[0] + yt_lang = video_data[1] if len(queue) > 0: if not path.exists(dl_dir + "/" + channel_conf["name"]): mkdir(dl_dir + "/" + channel_conf["name"]) # download videos, metadata and thumbnails from youtube - for item in queue: + for queue_item in queue: if not use_pt_http_import: - print("downloading " + item["yt_videoid"] + " from YouTube...") - download_yt_video(item, dl_dir, channel_conf) + print("downloading " + queue_item["yt_videoid"] + " from YouTube...") + download_yt_video(queue_item, dl_dir, channel_conf) print("done.") # TODO: download closest to config specified resolution instead of best resolution - thumb_extension = save_thumbnail(item, dl_dir, channel_conf) + thumb_extension = save_thumbnail(queue_item, dl_dir, channel_conf) # only save metadata to text file if archiving videos if not delete_videos: print("saving video metadata...") - save_metadata(item, dl_dir, channel_conf) + save_metadata(queue_item, dl_dir, channel_conf) print("done.") access_token = get_pt_auth(channel_conf) # upload videos, metadata and thumbnails to peertube - for item in queue: + for queue_item in queue: if not use_pt_http_import: - print("uploading " + item["yt_videoid"] + " to Peertube...") - upload_to_pt(dl_dir, channel_conf, item, access_token, thumb_extension) + print("uploading " + queue_item["yt_videoid"] + " to Peertube...") + upload_to_pt(dl_dir, channel_conf, queue_item, access_token, thumb_extension) print("done.") else: - print("mirroring " + item["link"] + " to Peertube using HTTP import...") - pt_http_import(dl_dir, channel_conf, item, access_token, thumb_extension) + print("mirroring " + queue_item["link"] + " to Peertube using HTTP import...") + pt_http_import(dl_dir, channel_conf, queue_item, access_token, thumb_extension, yt_lang) print("done.") if delete_videos: print("deleting videos and/or thumbnails...") @@ -281,7 +767,7 @@ def run_steps(conf): channel_counter += 1 def run(run_once=True): - #TODO: turn this into a cron job + #TODO: turn this into a daemon conf = utils.read_conf("config.toml") if run_once: run_steps(conf)