// podcast-hoarder/src/download.rs
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::fs;
use std::path;
use anyhow::Context;
use sanitise_file_name::sanitise;
use crate::folders;
use crate::rss;
#[derive(Debug, Default, serde::Serialize, serde::Deserialize)]
pub(crate) struct Specification<'a> {
    /// Map from episode id (GUID, or enclosure URL when no GUID is present)
    /// to the downloaded file's path, relative to the podcast folder.
    files: HashMap<Cow<'a, str>, Cow<'a, path::Path>>,
    /// Episodes keyed by publication timestamp. Each entry holds a `Vec`
    /// because multiple episodes may share the same timestamp.
    feed: BTreeMap<chrono::NaiveDateTime, Vec<Episode<'a>>>,
    /// URL of the podcast artwork most recently seen in the feed.
    image_url: Option<Cow<'a, str>>,
}
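// Illustrative sketch (not taken from a real feed) of the `spec.toml` this
// struct round-trips through `toml`; the values are hypothetical and the
// exact timestamp key format depends on chrono's serde representation:
//
//     [files]
//     "episode-guid-1" = "Episode One.mp3"
//
//     [[feed."2025-01-01T00:00:00"]]
//     title = "Episode One"
//     id = "episode-guid-1"
//     current = true
//     listened = false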
impl<'a> Specification<'a> {
    pub(crate) fn read_from_with_default(path: &path::Path) -> Result<Self, anyhow::Error> {
        Ok(if path.is_file() {
            toml::from_str(&fs::read_to_string(path)?)?
        } else {
            Specification::default()
        })
    }
    pub(crate) fn read_from(path: &path::Path) -> Result<Self, anyhow::Error> {
        if path.is_file() {
            Ok(toml::from_str(&fs::read_to_string(path)?)?)
        } else {
            anyhow::bail!("could not find specification for the desired podcast at {}", path.display())
        }
    }
    pub(crate) fn write_to(&self, path: &path::Path) -> Result<(), anyhow::Error> {
        Ok(fs::write(path, toml::to_string(self)?)?)
    }
pub(crate) fn feed_iter(&self) -> impl Iterator<Item = (&chrono::NaiveDateTime, &Vec<Episode<'a>>)> {
self.feed.iter()
}
pub(crate) fn feed_iter_mut(&mut self) -> impl Iterator<Item = (&chrono::NaiveDateTime, &mut Vec<Episode<'a>>)> {
self.feed.iter_mut()
}
pub(crate) fn path_from_id(&self, id: &str) -> Option<&path::Path> {
self.files.get(id).map(|v| &**v)
}
pub(crate) fn feed(&self) -> &BTreeMap<chrono::NaiveDateTime, Vec<Episode<'a>>> {
&self.feed
}
pub(crate) fn into_feed_and_files(self) -> (BTreeMap<chrono::NaiveDateTime, Vec<Episode<'a>>>, HashMap<Cow<'a, str>, Cow<'a, path::Path>>) {
(self.feed, self.files)
}
}
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub(crate) struct Episode<'a> {
/// Episode title.
title: Cow<'a, str>,
/// Show notes pulled from description or summary tag.
show_notes: Option<Cow<'a, str>>,
    /// The episode GUID, or the enclosure URL if no GUID is present.
id: Cow<'a, str>,
/// If the episode exists in the latest version of the feed.
current: bool,
/// Flag to keep track of which episodes have been listened to.
#[serde(default)]
pub(crate) listened: bool,
}
impl<'a> Episode<'a> {
    pub(crate) fn title(&self) -> &str {
self.title.as_ref()
}
pub(crate) fn id(&self) -> &str {
&self.id
}
}
/// Download `url` to `path`, failing unless the response status is 200.
fn download_to_file(url: &str, path: &path::Path) -> anyhow::Result<()> {
    let response = minreq::get(url)
        .send()?;
    if response.status_code == 200 {
        fs::write(path, response.as_bytes())?;
    } else {
        anyhow::bail!(r#"request for "{}" resulted in non-200 ({}) response code"#, url, response.status_code)
    }
    Ok(())
}
pub(crate) fn update_podcast(
alias: &str,
root: &path::Path,
feed_location: &str,
) -> anyhow::Result<()> {
// Create output directory
let output = folders::podcast_folder(root, alias);
if !output.exists() {
fs::create_dir_all(&output)
.context(format!("failed to create output directory for podcast {}", alias))?;
}
println!(r#"[info] scanning feed for "{}""#, alias);
    if feed_location.starts_with("http://") || feed_location.starts_with("https://") {
let feed_url = feed_location;
// Get the podcast feed
let response = minreq::get(feed_url)
// For SquareSpace which refuses requests with no User-Agent
.with_header("User-Agent", "podcast-downloader")
.with_header("Accept", "*/*")
.send()
.context(format!(r#"error when requesting feed url "{}" for {}"#, feed_url, alias))?;
if response.status_code != 200 {
eprintln!(r#"[error] feed "{}" for alias {} responded with non-200 ({}) status code"#, feed_url, alias, response.status_code);
return Ok(());
}
        update_podcast_from_feed(&output, response.as_str()?)
} else {
let feed_path = root.join(feed_location);
match fs::read_to_string(&feed_path) {
Ok(feed) => update_podcast_from_feed(&output, &feed),
Err(err) => {
eprintln!(r#"[error] failed to read path "{}" with error {}"#, feed_path.display(), err);
Ok(())
}
}
}
}
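// Illustrative call with a hypothetical alias, root, and feed URL; a
// `feed_location` that does not start with http(s) is treated as a path
// relative to `root`:
//
//     update_podcast("my-show", std::path::Path::new("/srv/podcasts"), "https://example.com/feed.xml")?;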
/// Extract the file extension from the final path segment of `url`,
/// ignoring any query string or fragment.
fn extract_extension_from_url(url: &str) -> Result<Option<String>, url::ParseError> {
    let parsed = url::Url::parse(url)?;
    // Only inspect the last path segment: splitting the whole URL on '.'
    // would mistake dots in the host (or an earlier segment) for an
    // extension whenever the file name itself has none.
    Ok(parsed
        .path_segments()
        .and_then(|segments| segments.last())
        .and_then(|name| name.rsplit_once('.'))
        .map(|(_, extension)| extension.to_owned()))
}
fn update_artwork<'a, 'b>(
channel: &rss::Channel<'a>,
spec: &mut Specification<'b>,
output: &path::Path,
) -> anyhow::Result<()> where 'a: 'b {
let image_url = match (&channel.image, &channel.itunes_image) {
(Some(image), _) => Some(&image.url),
(_, Some(itunes_image)) => Some(&itunes_image.href),
_ => None,
};
match (&spec.image_url, image_url) {
// They match, so no need to change anything
(Some(old), Some(new)) if old == new => (),
// New and different URL
(_, Some(new)) => {
match extract_extension_from_url(new.as_ref()) {
Ok(Some(extension)) => {
let cover_path = output.join(format!("cover-original.{}", extension));
// Remove cover with conflicting file path if it exists
if cover_path.exists() {
fs::remove_file(&cover_path)?;
}
if let Err(err) = download_to_file(new.as_ref(), &cover_path) {
eprintln!(r#"[error] failed to download artwork with error "{}". skipping"#, err);
}
},
Ok(None) => {
println!(r#"[warning] could not identify file type from url "{}" for podcast artwork "{}". skipping."#, new, channel.title);
}
Err(err) => {
println!(r#"[warning] failed to parse url "{}" for "{}" artwork with error: {}. skipping."#, new, channel.title, err);
},
};
spec.image_url = Some(new.clone());
},
_ => (),
}
Ok(())
}
pub(crate) fn update_podcast_from_feed(
output: &path::Path,
feed: &str,
) -> anyhow::Result<()> {
let feed = match xml_serde::from_str::<rss::Feed>(&feed) {
Ok(feed) => feed,
Err(err) => {
eprintln!(r#"[error] failed to parse rss feed with error: "{}""#, err);
return Ok(())
}
};
let channel = feed.rss.channel;
let spec_file = output.join("spec.toml");
let mut spec = Specification::read_from_with_default(&spec_file)?;
    // Get the set of ids for all currently available episodes so that we can
    // later mark any other episodes as unavailable. The id falls back to the
    // enclosure URL when the GUID is absent, matching the loop below (the
    // previous unconditional unwrap would panic on GUID-less feeds).
    let current_episodes = {
        let mut current_episodes = HashSet::new();
        for episode in &channel.items {
            if let Some(guid) = &episode.guid {
                current_episodes.insert(guid.to_string());
            } else if let Some(enclosure) = &episode.enclosure {
                current_episodes.insert(enclosure.url.to_string());
            }
        }
        current_episodes
    };
update_artwork(
&channel,
&mut spec,
&output,
)?;
for item in channel.items {
        let rss::Item {
            title,
            enclosure,
            description,
            summary,
            guid,
            // Take `published` here too: the destructure consumes `item`,
            // so this is cleaner than reading `item.published` afterwards.
            published,
            ..
        } = item;
let Some(enclosure) = enclosure else {
println!(r#"[warning] episode "{}" does not have an enclosure tag. skipping."#, title);
continue;
};
        // Prefer the description tag, falling back to the summary tag
        let description = description.or(summary);
let guid = guid.as_deref();
let url = enclosure.url.as_ref();
let id = guid.unwrap_or(url);
        match spec.files.get(id) {
            // File already downloaded
            Some(path) => {
                // The file has been deleted by another process but the
                // specification hasn't been updated; just redownload it.
                // This also gives an easy way to force a redownload.
                // Note `path` is stored relative to the podcast folder, so
                // it must be joined with `output` before use rather than
                // resolved against the current directory.
                let target = output.join(path);
                if !target.exists() {
                    println!(r#"[info] redownloading "{}" as the file seems to have been deleted"#, title);
                    if let Err(err) = download_to_file(enclosure.url.as_ref(), &target) {
                        eprintln!(r#"[error] failed to redownload episode with error: "{}". skipping"#, err);
                        continue;
                    }
                }
            },
None => {
let extension = match extract_extension_from_url(enclosure.url.as_ref()) {
Ok(Some(extension)) => extension,
Ok(None) => {
println!(r#"[warning] could not identify file type from url "{}" for episode "{}". skipping."#, url, title);
continue;
}
Err(err) => {
println!(r#"[warning] failed to parse url "{}" for episode "{}" with error: {}. skipping."#, url, title, err);
continue;
},
};
let file_path = if ["mp3", "m4a", "ogg", "wav", "mp4", "m4v", "mov", "aiff"].contains(&&extension.to_lowercase()[..]) {
output.join(format!("{}.{}", sanitise(&title), extension))
} else {
println!("[warning] unsupported file extension: {}. skipping.", extension);
continue;
};
                // The filename may already exist even though the episode has
                // not been downloaded. In that case construct a new filename
                // by appending a parenthesised counter to the stem.
                let file_path = if file_path.exists() {
                    increment_file_name(&file_path).into_owned()
                } else { file_path };
println!(r#"[info] downloading "{}""#, title);
match download_to_file(enclosure.url.as_ref(), &file_path) {
                    Ok(()) => {
                        // `file_path` was built from `output`, so its prefix can
                        // be stripped directly; canonicalising first could
                        // resolve symlinks into a path that no longer starts
                        // with `output` and make `strip_prefix` panic.
                        spec.files.insert(
                            Cow::from(id.to_owned()),
                            Cow::from(file_path.strip_prefix(output).unwrap().to_owned()),
                        );
let episode = Episode {
show_notes: description,
id: Cow::from(id.to_owned()),
current: true,
title,
listened: false,
};
                        spec.feed.entry(published).or_default().push(episode);
                        // Persist the specification after each successful
                        // download so progress is not lost if a later one fails
spec.write_to(&spec_file)?;
},
Err(err) => {
eprintln!(r#"[error] failed to request episode "{}" with error: "{}". skipping"#, title, err);
continue;
}
}
},
}
}
let mut feed_change = false;
    // Mark episodes which have been removed from the feed as no longer current
    for episodes in spec.feed.values_mut() {
        for episode in episodes {
            if !current_episodes.contains(episode.id.as_ref()) {
                episode.current = false;
                feed_change = true;
            }
        }
    }
if feed_change {
spec.write_to(&spec_file)?;
}
Ok(())
}
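// Illustrative use with a saved feed snapshot (paths hypothetical):
//
//     let feed = std::fs::read_to_string("feeds/my-show.xml")?;
//     update_podcast_from_feed(std::path::Path::new("podcasts/my-show"), &feed)?;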
/// Given a file path `something.xyz`, returns `something.xyz` itself if it
/// does not exist, otherwise the first path of the form `something(a).xyz`
/// (for a non-negative integer `a`) that does not exist.
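/// For example (illustrative): with `episode.mp3` and `episode(0).mp3`
/// already present on disk, passing `episode.mp3` returns `episode(1).mp3`.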
fn increment_file_name(path: &path::Path) -> Cow<'_, path::Path> {
    if path.exists() {
        let mut new_path = path.to_owned();
        let mut i: u32 = 0;
        while new_path.exists() {
            // Build the whole file name in one go: calling `set_extension`
            // after `set_file_name` would misparse a stem that itself
            // contains a dot (e.g. `my.episode(0)` would become `my.xyz`).
            let mut file_name = path.file_stem().unwrap().to_owned();
            file_name.push(format!("({})", i));
            if let Some(extension) = path.extension() {
                file_name.push(".");
                file_name.push(extension);
            }
            new_path.set_file_name(file_name);
            i += 1;
        }
        Cow::from(new_path)
    } else {
        // The non-colliding case is by far the most common and the cheapest
        // to handle, so check for it first and avoid the extra allocation.
        Cow::from(path)
    }
}