initial commit

2024-01-11 17:30:55 +11:00
commit dd04b9f29f
9 changed files with 1849 additions and 0 deletions
--- a/src/download.rs
+++ b/src/download.rs
@@ -0,0 +1,352 @@
+use std::fs;
+use std::path;
+use std::borrow::Cow;
+use std::collections::{HashMap, BTreeMap, HashSet};
+
+use anyhow::Context;
+use sanitise_file_name::sanitise;
+
+use crate::rss;
+
+/// On-disk record of what has been downloaded for one podcast.
+///
+/// `update_podcast_from_feed` reads it from, and writes it back to, the file
+/// `spec.toml` inside the podcast's output directory (serialised as TOML).
+#[derive(Default, serde::Serialize, serde::Deserialize)]
+struct Specification<'a> {
+    /// Map from episode id (the GUID, or the enclosure URL when there is no
+    /// GUID) to the downloaded file's path relative to the output directory.
+    files : HashMap<Cow<'a, str>, Cow<'a, path::Path>>,
+    /// Downloaded episodes grouped by their publish timestamp.
+    feed : BTreeMap<chrono::NaiveDateTime, Vec<Episode<'a>>>,
+    /// Artwork URL most recently recorded by `update_artwork`, if any.
+    image_url : Option<Cow<'a, str>>,
+}
+
+impl<'a> Specification<'a> {
+    /// Loads the specification stored at `path`.
+    ///
+    /// When `path` is not an existing file an empty (default) specification is
+    /// returned instead. Read and TOML parse failures are returned as errors.
+    fn read_from(path : &path::Path) -> Result<Self, anyhow::Error> {
+        if !path.is_file() {
+            return Ok(Specification::default());
+        }
+
+        let contents = fs::read_to_string(path)?;
+        let spec : Self = toml::from_str(contents.as_str())?;
+        Ok(spec)
+    }
+
+    /// Serialises the specification as TOML and writes it to `path`,
+    /// replacing whatever the file previously contained.
+    fn write_to(&self, path : &path::Path) -> Result<(), anyhow::Error> {
+        let serialised = toml::to_string(self)?;
+        fs::write(path, serialised.as_bytes())?;
+        Ok(())
+    }
+}
+
+/// Metadata recorded for one downloaded episode.
+///
+/// Instances are stored in `Specification::feed` under the episode's publish
+/// timestamp and are created only after the episode's download succeeded.
+#[derive(serde::Serialize, serde::Deserialize)]
+struct Episode<'a> {
+    /// Episode title.
+    title : Cow<'a, str>,
+    /// Show notes pulled from description or summary tag.
+    show_notes : Option<Cow<'a, str>>,
+    /// This is the GUID or the URL if the GUID is not present.
+    /// The same value is used as the key into `Specification::files`.
+    id : Cow<'a, str>,
+    /// If the episode exists in the latest version of the feed.
+    current : bool,
+}
+
+/// Fetches `url` with a GET request and stores the response body at `path`.
+///
+/// Fails when the request cannot be sent, when the server answers with any
+/// status other than 200, or when the file cannot be written. Nothing is
+/// written for a non-200 response.
+fn download_to_file(url : &str, path : &path::Path) -> anyhow::Result<()> {
+    let response = minreq::get(url).send()?;
+
+    // Guard clause: only a plain 200 counts as a successful download
+    if response.status_code != 200 {
+        anyhow::bail!("request for episode resulted in non 200 ({}) response code", response.status_code)
+    }
+
+    fs::write(path, response.as_bytes())?;
+    Ok(())
+}
+
+/// Updates the podcast known as `alias` from the feed at `feed_location`.
+///
+/// Episodes live in a directory under `root` named after the sanitised alias;
+/// the directory is created when it does not exist yet. A `feed_location`
+/// starting with `http` is fetched over the network, anything else is read as
+/// a file path relative to `root`.
+///
+/// A non-200 feed response or an unreadable feed file is reported on stderr
+/// and treated as success. A failed request, an undecodable response body, a
+/// failure to create the directory, or an error from
+/// `update_podcast_from_feed` is returned to the caller.
+pub (crate) fn update_podcast(
+    alias : &str,
+    root : &path::Path,
+    feed_location : &str,
+) -> anyhow::Result<()> {
+
+    // Make sure the podcast has somewhere to live
+    let output = root.join(sanitise(alias));
+    if !output.exists() {
+        fs::create_dir(&output)
+            .with_context(|| format!("failed to create output directory for podcast {}", alias))?;
+    }
+
+    println!(r#"info: scanning feed for "{}""#, alias);
+
+    // Local feed: read it straight from disk
+    if !feed_location.starts_with("http") {
+        let feed_path = root.join(feed_location);
+
+        return match fs::read_to_string(&feed_path) {
+            Ok(feed) => update_podcast_from_feed(&output, &feed),
+            Err(err) => {
+                eprintln!(r#"error: failed to read path "{}" with error {}"#, feed_path.display(), err);
+                Ok(())
+            }
+        };
+    }
+
+    // Remote feed: request it over HTTP
+    let response = minreq::get(feed_location)
+        // For SquareSpace which refuses requests with no User-Agent
+        .with_header("User-Agent", "podcast-downloader")
+        .with_header("Accept", "*/*")
+        .send()
+        .with_context(|| format!(r#"error when requesting feed url "{}" for {}"#, feed_location, alias))?;
+
+    if response.status_code != 200 {
+        eprintln!(r#"error: feed "{}" for alias {} responded with non-200 ({}) status code"#, feed_location, alias, response.status_code);
+        return Ok(());
+    }
+
+    update_podcast_from_feed(&output, response.as_str()?)
+}
+
+/// Extracts the file extension from the final path segment of `url`.
+///
+/// Only the path is inspected, so the query string, the fragment and any dots
+/// in the host name are ignored. `Ok(None)` is returned when the final segment
+/// contains no `.`, when nothing follows the last `.`, or when the URL has no
+/// path segments at all.
+///
+/// # Errors
+///
+/// Returns the parse error when `url` is not a valid URL.
+fn extract_extension_from_url(url : &str) -> Result<Option<String>, url::ParseError>  {
+    let parsed = url::Url::parse(url)?;
+
+    let extension = parsed
+        .path_segments()
+        // The file name, if any, is the last segment of the path
+        .and_then(|mut segments| segments.next_back())
+        .and_then(|file_name| file_name.rsplit_once('.'))
+        .map(|(_, extension)| extension)
+        // A trailing dot carries no usable extension
+        .filter(|extension| !extension.is_empty())
+        .map(str::to_owned);
+
+    Ok(extension)
+}
+
+/// Downloads the channel artwork when its URL differs from the recorded one.
+///
+/// The channel's `image` is preferred over its `itunes_image`. The artwork is
+/// saved as `cover.<extension>` inside `output`, replacing a file of that name.
+///
+/// `spec.image_url` is updated after a successful download, and also when no
+/// file extension can be derived from the URL (retrying the same URL cannot
+/// succeed). It is left untouched when the download itself fails so that the
+/// next run tries again; an existing cover file is kept in that case too.
+///
+/// Every failure is reported and skipped, so this currently always returns
+/// `Ok(())`.
+fn update_artwork<'a, 'b>(
+    channel : &rss::Channel<'a>,
+    spec : &mut Specification<'b>,
+    output : &path::Path,
+) -> anyhow::Result<()> where 'a : 'b {
+
+    let image_url = channel.image
+        .as_ref()
+        .map(|image| &image.url)
+        .or_else(|| channel.itunes_image.as_ref().map(|itunes_image| &itunes_image.href));
+
+    // No artwork advertised by the feed
+    let Some(new) = image_url else {
+        return Ok(());
+    };
+    let new_url : &str = new;
+
+    // They match, so no need to change anything
+    if spec.image_url.as_deref() == Some(new_url) {
+        return Ok(());
+    }
+
+    // New and different URL
+    let record_url = match extract_extension_from_url(new_url) {
+        Ok(Some(extension)) => {
+            let cover_path = output.join(format!("cover.{}", extension));
+
+            // The download overwrites any cover at the same path, so the old
+            // file is not removed up front and survives a failed download.
+            match download_to_file(new_url, &cover_path) {
+                Ok(()) => true,
+                Err(err) => {
+                    eprintln!(r#"error: failed to download artwork with error "{}". skipping"#, err);
+                    false
+                },
+            }
+        },
+        Ok(None) => {
+            println!(r#"warning: could not identify file type from url "{}" for podcast artwork "{}". skipping."#, new, channel.title);
+            true
+        },
+        Err(err) => {
+            println!(r#"warning: failed to parse url "{}" for "{}" artwork with error: {}. skipping."#, new, channel.title, err);
+            true
+        },
+    };
+
+    if record_url {
+        spec.image_url = Some(new.clone());
+    }
+
+    Ok(())
+}
+
+
+/// Synchronises the podcast stored in `output` with the RSS document `feed`.
+///
+/// Parses `feed`, refreshes the artwork, downloads every episode that is not
+/// yet recorded in `spec.toml` (or whose recorded file has gone missing) and
+/// finally updates the `current` flag of every recorded episode. The
+/// specification is written after each successful new download and once more
+/// at the end.
+///
+/// An episode is identified by its GUID, or by its enclosure URL when it has no
+/// GUID.
+///
+/// Problems with individual episodes are reported and skipped, as is a feed
+/// that fails to parse (in which case `Ok(())` is returned).
+///
+/// # Errors
+///
+/// Returns an error when the specification cannot be read or written, or when
+/// `update_artwork` returns one.
+pub (crate) fn update_podcast_from_feed(
+    output : &path::Path,
+    feed : &str,
+) -> anyhow::Result<()> {
+
+    /// Lowercase file extensions accepted as episode media.
+    const SUPPORTED_EXTENSIONS : [&str; 8] = ["mp3", "m4a", "ogg", "wav", "mp4", "m4v", "mov", "aiff"];
+
+    let feed = match xml_serde::from_str::<rss::Feed>(feed) {
+        Ok(feed) => feed,
+        Err(err) => {
+            eprintln!(r#"error: failed to parse rss feed with error: "{}""#, err);
+            return Ok(())
+        }
+    };
+
+    let channel = feed.rss.channel;
+
+    let spec_file = output.join("spec.toml");
+
+    let mut spec = Specification::read_from(&spec_file)?;
+
+    // Get set of all currently available episodes such that we can later mark
+    // any other episodes as unavailable. The id is derived exactly as in the
+    // download loop below, so items without a GUID neither panic nor get
+    // mistaken for removed episodes.
+    let current_episodes : HashSet<String> = channel.items
+        .iter()
+        .filter_map(|item| {
+            item.guid
+                .as_deref()
+                .or_else(|| item.enclosure.as_ref().map(|enclosure| &*enclosure.url))
+                .map(str::to_owned)
+        })
+        .collect();
+
+    update_artwork(
+        &channel,
+        &mut spec,
+        output,
+    )?;
+
+    for item in channel.items {
+
+        let rss::Item {
+            title,
+            enclosure,
+            description,
+            summary,
+            published,
+            guid,
+            ..
+        } = item;
+
+        let Some(enclosure) = enclosure else {
+            println!(r#"warning: episode "{}" does not have an enclosure tag. skipping."#, title);
+            continue;
+        };
+
+        // Show notes come from the description, falling back to the summary
+        let show_notes = description.or(summary);
+
+        let url : &str = &enclosure.url;
+        let id = guid.as_deref().unwrap_or(url);
+
+        // File already downloaded
+        if let Some(path) = spec.files.get(id) {
+            // Recorded paths are relative to the output directory
+            let existing = output.join(path);
+
+            // File has been deleted by another process but the specification hasn't been updated
+            // In this case we just redownload the file
+            // This gives an easy way to force a redownload
+            if !existing.exists() {
+                println!(r#"info: redownloading "{}" as the file seems to have been deleted"#, title);
+                if let Err(err) = download_to_file(url, &existing) {
+                    eprintln!(r#"error: failed to redownload new episode with error "{}". skipping"#, err);
+                }
+            }
+
+            continue;
+        }
+
+        let extension = match extract_extension_from_url(url) {
+            Ok(Some(extension)) => extension,
+            Ok(None) => {
+                println!(r#"warning: could not identify file type from url "{}" for episode "{}". skipping."#, url, title);
+                continue;
+            }
+            Err(err) => {
+                println!(r#"warning: failed to parse url "{}" for episode "{}" with error: {}. skipping."#, url, title, err);
+                continue;
+            },
+        };
+
+        if !SUPPORTED_EXTENSIONS.contains(&extension.to_lowercase().as_str()) {
+            println!("warning: unsupported file extension: {}. skipping.", extension);
+            continue;
+        }
+
+        let file_path = output.join(format!("{}.{}", sanitise(&title), extension));
+
+        // The filename happens to exist despite the episode not being downloaded.
+        // In this case we need to construct a new filename by appending a digit to the end
+        let file_path = if file_path.exists() {
+            increment_file_name(&file_path).into_owned()
+        } else { file_path };
+
+        println!(r#"info: downloading "{}" to "{}""#, title, file_path.display());
+
+        if let Err(err) = download_to_file(url, &file_path) {
+            eprintln!(r#"error: failed to request episode "{}" with error "{}". skipping"#, title, err);
+            continue;
+        }
+
+        // `file_path` was built by joining onto `output`, so the prefix can be
+        // stripped directly; no canonicalisation (which can fail or resolve
+        // symlinks away from `output`) is needed.
+        let relative_path = file_path.strip_prefix(output).unwrap_or(&file_path).to_owned();
+
+        spec.files.insert(
+            Cow::from(id.to_owned()),
+            Cow::from(relative_path),
+        );
+
+        spec.feed.entry(published).or_default().push(Episode {
+            title,
+            show_notes,
+            id : Cow::from(id.to_owned()),
+            current : true,
+        });
+
+        spec.write_to(&spec_file)?;
+    }
+
+    // Episodes missing from the feed are no longer current; episodes that have
+    // reappeared become current again
+    for episode in spec.feed.values_mut().flatten() {
+        episode.current = current_episodes.contains(&*episode.id);
+    }
+
+    spec.write_to(&spec_file)?;
+
+    Ok(())
+}
+
+/// Given a file path `something.xyz`, returns the first path of the form
+/// `something(a).xyz` where `a` is a non-negative integer which does not
+/// currently exist, or `something.xyz` if it itself does not exist.
+///
+/// Only the text after the last `.` of the file name is treated as the
+/// extension, so dots inside the stem are preserved (`a.b.xyz` becomes
+/// `a.b(0).xyz`). A path without a final file name component (such as `/`)
+/// is returned unchanged.
+fn increment_file_name(path : &path::Path) -> Cow<'_, path::Path> {
+
+    // This is by far the most likely case and needs no allocation, so it is
+    // checked first
+    if !path.exists() {
+        return Cow::from(path);
+    }
+
+    // Nothing can be appended to a path that has no file name
+    let Some(stem) = path.file_stem() else {
+        return Cow::from(path);
+    };
+    let extension = path.extension();
+
+    let mut i : u32 = 0;
+    loop {
+        // Assemble `stem(i).extension` by hand: calling `set_extension` on a
+        // name whose stem contains a dot would cut the stem at that dot
+        let mut file_name = stem.to_owned();
+        file_name.push(format!("({})", i));
+        if let Some(extension) = extension {
+            file_name.push(".");
+            file_name.push(extension);
+        }
+
+        let candidate = path.with_file_name(file_name);
+        if !candidate.exists() {
+            return Cow::from(candidate);
+        }
+
+        i += 1;
+    }
+}
+
--- a/src/input.rs
+++ b/src/input.rs
@@ -0,0 +1,20 @@
+use std::path;
+use std::collections::HashMap;
+
+// Command-line arguments, parsed in `main` through clap's derive API.
+// NOTE(review): clap's derive turns `///` comments into help text, which is
+// why this struct-level note is a plain comment; edit the field docs with care.
+#[derive(clap::Parser)]
+pub (crate) struct Args {
+    /// Path to the configuration file listing podcast RSS feeds.
+    #[arg(default_value = "./podcasts.toml")]
+    pub (crate) config : path::PathBuf,
+    /// The podcast to update. Updates all in configuration file if unspecified.
+    #[arg(long, short)]
+    pub (crate) podcast : Option<String>,
+}
+
+/// Struct modelling configuration file format.
+///
+/// Deserialized from the TOML configuration file that `main` reads.
+#[derive(serde::Deserialize)]
+pub (crate) struct Config {
+    /// Map from podcast alias to RSS feed either as a url (prefix: http) or file path.
+    /// File paths are resolved against the directory containing the configuration file.
+    pub (crate) podcasts : HashMap<String, String>,
+}
+
--- a/src/main.rs
+++ b/src/main.rs
@@ -0,0 +1,47 @@
+mod rss;
+mod input;
+mod download;
+
+use std::fs;
+
+use anyhow::Context;
+
+/// Entry point: reads the configuration, then updates either the single
+/// podcast selected on the command line or every configured podcast.
+///
+/// The directory containing the configuration file is the root under which
+/// podcasts are stored. The first error returned by an update ends the run.
+fn main() -> anyhow::Result<()> {
+    use clap::Parser;
+
+    let args = input::Args::parse();
+
+    let raw_config = fs::read_to_string(&args.config)
+        .with_context(|| "failed to read in podcast configuration file")?;
+    let config : input::Config = toml::from_str(raw_config.as_str())?;
+
+    let config_path = args.config.canonicalize()?;
+    let root = match config_path.parent() {
+        Some(root) => root,
+        None => anyhow::bail!("could not get parent of configuration path for root directory"),
+    };
+
+    match args.podcast {
+        // Updating single podcast
+        Some(alias) => match config.podcasts.get(&alias) {
+            Some(feed_url) => download::update_podcast(&alias, root, feed_url)?,
+            None => anyhow::bail!(r#"podcast "{}" not found in configuration file"#, alias),
+        },
+        // Updating all podcasts
+        None => {
+            for (alias, feed_url) in &config.podcasts {
+                download::update_podcast(alias, root, feed_url)?;
+            }
+        },
+    }
+
+    Ok(())
+}
+
+
--- a/src/rss.rs
+++ b/src/rss.rs
@@ -0,0 +1,91 @@
+#![allow(dead_code)]
+
+// See https://support.google.com/podcast-publishers/answer/9889544
+// for some reasonable guidelines for how a podcast RSS feed should look.
+
+use std::fmt;
+use std::borrow::Cow;
+
+/// Top-level value deserialized from a feed document; it holds the single
+/// `rss` element.
+#[derive(Debug, serde::Deserialize)]
+pub struct Feed<'a> {
+    /// The document's `rss` element.
+    pub (crate) rss : Rss<'a>,
+}
+
+/// The `rss` element, which wraps the feed's `channel`.
+#[derive(Debug, serde::Deserialize)]
+pub struct Rss<'a> {
+    /// The podcast described by this feed.
+    pub (crate) channel : Channel<'a>,
+}
+
+/// The `channel` element: podcast-level metadata together with its episodes.
+#[derive(Debug, serde::Deserialize)]
+pub struct Channel<'a> {
+    /// Every `item` child of the channel; empty when there are none.
+    #[serde(rename = "item", default)]
+    pub (crate) items : Vec<Item<'a>>,
+    /// The channel's `link` element.
+    pub (crate) link : Cow<'a, str>,
+    /// The channel's `title` element; used in artwork warnings.
+    pub (crate) title : Cow<'a, str>,
+    /// The channel's `description` element, if present.
+    pub (crate) description : Option<Cow<'a, str>>,
+    /// The `itunes:author` element, if present.
+    #[serde(rename = "{http://www.itunes.com/dtds/podcast-1.0.dtd}itunes:author")]
+    pub (crate) author : Option<Cow<'a, str>>,
+    /// The `itunes:summary` element, if present.
+    #[serde(rename = "{http://www.itunes.com/dtds/podcast-1.0.dtd}itunes:summary")]
+    pub (crate) summary : Option<Cow<'a, str>>,
+    /// The `itunes:image` element, if present; the artwork fallback when
+    /// `image` is absent.
+    #[serde(rename = "{http://www.itunes.com/dtds/podcast-1.0.dtd}itunes:image")]
+    pub (crate) itunes_image : Option<ItunesImage<'a>>,
+    /// The `image` element, if present; the preferred artwork source.
+    pub (crate) image : Option<Image<'a>>,
+}
+
+/// A channel's `image` element.
+#[derive(Debug, serde::Deserialize)]
+pub struct Image<'a> {
+    /// The image's `link` element.
+    pub (crate) link : Cow<'a, str>,
+    /// The image's `title` element.
+    pub (crate) title : Cow<'a, str>,
+    /// The image's `url` element; the address the artwork is downloaded from.
+    pub (crate) url : Cow<'a, str>,
+}
+
+/// A channel's `itunes:image` element.
+#[derive(Debug, serde::Deserialize)]
+pub struct ItunesImage<'a> {
+    /// Address of the artwork, taken from the element's `href` attribute
+    /// (presumably what the `$attr:` rename prefix selects — confirm against
+    /// the XML deserializer's documentation).
+    #[serde(rename = "$attr:href")]
+    pub (crate) href : Cow<'a, str>,
+}
+
+/// Deserializes an item's publish date from its textual form.
+///
+/// The text is parsed with the pattern `%a, %d %b %Y %H:%M:%S %Z`, for example
+/// `Thu, 11 Jan 2024 17:30:55 +1100`; whitespace around it is ignored. The
+/// result is a `NaiveDateTime`, so it carries no time zone.
+///
+/// NOTE(review): chrono documents `%Z` as skipping the zone token while
+/// parsing, so the value is presumably the wall-clock time exactly as written
+/// in the feed rather than a UTC-normalised time — confirm.
+///
+/// # Errors
+///
+/// Fails with a custom deserialization error when the text does not match the
+/// pattern.
+fn deserialize_publish_date<'de, D : serde::de::Deserializer<'de>> (
+    deserializer : D
+) -> Result<chrono::NaiveDateTime, D::Error> {
+    struct Visitor;
+    impl<'de> serde::de::Visitor<'de> for Visitor {
+        type Value = chrono::NaiveDateTime;
+
+        fn expecting(&self, formatter : &mut fmt::Formatter) -> fmt::Result {
+            formatter.write_str(r#"a publish date such as "Thu, 11 Jan 2024 17:30:55 +1100""#)
+        }
+
+        fn visit_str<E : serde::de::Error>(self, input : &str) -> Result<Self::Value, E> {
+            // Element text may be padded with whitespace or line breaks
+            chrono::NaiveDateTime::parse_from_str(
+                input.trim(),
+                "%a, %d %b %Y %H:%M:%S %Z"
+            ).map_err(E::custom)
+        }
+    }
+
+    deserializer.deserialize_any(Visitor)
+}
+
+/// An `item` element of a channel, i.e. one episode.
+#[derive(Debug, serde::Deserialize)]
+pub struct Item<'a> {
+    /// The item's `title` element.
+    pub (crate) title : Cow<'a, str>,
+    /// The item's `enclosure` element, if present; it carries the media URL.
+    pub (crate) enclosure : Option<Enclosure<'a>>,
+    /// The item's `description` element, if present.
+    pub (crate) description : Option<Cow<'a, str>>,
+    /// The `itunes:summary` element, if present.
+    #[serde(rename = "{http://www.itunes.com/dtds/podcast-1.0.dtd}itunes:summary")]
+    pub (crate) summary : Option<Cow<'a, str>>,
+    /// The item's `pubDate` element, parsed by `deserialize_publish_date`.
+    #[serde(rename = "pubDate", deserialize_with = "deserialize_publish_date")]
+    pub (crate) published : chrono::NaiveDateTime,
+    /// The item's `guid` element, if present.
+    pub (crate) guid : Option<Cow<'a, str>>,
+}
+
+/// An item's `enclosure` element, describing the attached media file.
+/// The fields are presumably read from the element's attributes via the
+/// `$attr:` rename prefix — confirm against the XML deserializer's documentation.
+#[derive(Debug, serde::Deserialize)]
+pub struct Enclosure<'a> {
+    /// `url`: the address the episode is downloaded from.
+    #[serde(rename = "$attr:url")]
+    pub (crate) url : Cow<'a, str>,
+    /// `type`, if present.
+    #[serde(rename = "$attr:type")]
+    pub (crate) mime_type : Option<Cow<'a, str>>,
+    /// `length`, if present, as an unsigned integer.
+    #[serde(rename = "$attr:length")]
+    pub (crate) length : Option<u64>,
+}
+