// podcast-hoarder/src/download.rs
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::fs;
use std::path;
use anyhow::Context;
use sanitise_file_name::sanitise;
use crate::folders;
use crate::rss;
#[derive(Debug, Default, serde::Serialize, serde::Deserialize)]
pub(crate) struct Specification<'a> {
    /// Map from episode id (GUID, or enclosure URL when no GUID is present)
    /// to the downloaded file's path, relative to the podcast folder.
    files: HashMap<Cow<'a, str>, Cow<'a, path::Path>>,
    /// Episodes keyed by publication timestamp. Each entry holds a `Vec`
    /// because multiple episodes may share the same timestamp.
    feed: BTreeMap<chrono::NaiveDateTime, Vec<Episode<'a>>>,
    /// URL of the podcast artwork most recently seen in the feed.
    image_url: Option<Cow<'a, str>>,
}
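// Illustrative sketch (not taken from a real feed) of the `spec.toml` this
// struct round-trips through `toml`; the values are hypothetical and the
// exact timestamp key format depends on chrono's serde representation:
//
//     [files]
//     "episode-guid-1" = "Episode One.mp3"
//
//     [[feed."2025-01-01T00:00:00"]]
//     title = "Episode One"
//     id = "episode-guid-1"
//     current = true
//     listened = false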
impl<'a> Specification<'a> {
    pub(crate) fn read_from_with_default(path: &path::Path) -> Result<Self, anyhow::Error> {
        Ok(if path.is_file() {
            toml::from_str(&fs::read_to_string(path)?)?
        } else {
            Specification::default()
        })
    }
    pub(crate) fn read_from(path: &path::Path) -> Result<Self, anyhow::Error> {
        if path.is_file() {
            Ok(toml::from_str(&fs::read_to_string(path)?)?)
        } else {
            anyhow::bail!("could not find specification for the desired podcast at {}", path.display())
        }
    }
    pub(crate) fn write_to(&self, path: &path::Path) -> Result<(), anyhow::Error> {
        Ok(fs::write(path, toml::to_string(self)?)?)
    }
pub(crate) fn feed_iter(&self) -> impl Iterator<Item = (&chrono::NaiveDateTime, &Vec<Episode<'a>>)> {
self.feed.iter()
}
pub(crate) fn feed_iter_mut(&mut self) -> impl Iterator<Item = (&chrono::NaiveDateTime, &mut Vec<Episode<'a>>)> {
self.feed.iter_mut()
}
pub(crate) fn path_from_id(&self, id: &str) -> Option<&path::Path> {
self.files.get(id).map(|v| &**v)
}
pub(crate) fn feed(&self) -> &BTreeMap<chrono::NaiveDateTime, Vec<Episode<'a>>> {
&self.feed
}
pub(crate) fn into_feed_and_files(self) -> (BTreeMap<chrono::NaiveDateTime, Vec<Episode<'a>>>, HashMap<Cow<'a, str>, Cow<'a, path::Path>>) {
(self.feed, self.files)
}
}
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub(crate) struct Episode<'a> {
/// Episode title.
title: Cow<'a, str>,
/// Show notes pulled from description or summary tag.
show_notes: Option<Cow<'a, str>>,
    /// The episode GUID, or the enclosure URL if no GUID is present.
id: Cow<'a, str>,
/// If the episode exists in the latest version of the feed.
current: bool,
/// Flag to keep track of which episodes have been listened to.
#[serde(default)]
pub(crate) listened: bool,
}
impl<'a> Episode<'a> {
    pub(crate) fn title(&self) -> &str {
self.title.as_ref()
}
pub(crate) fn id(&self) -> &str {
&self.id
}
}
/// Download `url` to `path`, failing unless the response status is 200.
fn download_to_file(url: &str, path: &path::Path) -> anyhow::Result<()> {
    let response = minreq::get(url)
        .send()?;
    if response.status_code == 200 {
        fs::write(path, response.as_bytes())?;
    } else {
        anyhow::bail!(r#"request for "{}" resulted in non-200 ({}) response code"#, url, response.status_code)
    }
    Ok(())
}
pub(crate) fn update_podcast(
alias: &str,
root: &path::Path,
feed_location: &str,
) -> anyhow::Result<()> {
// Create output directory
let output = folders::podcast_folder(root, alias);
if !output.exists() {
fs::create_dir_all(&output)
.context(format!("failed to create output directory for podcast {}", alias))?;
}
println!(r#"[info] scanning feed for "{}""#, alias);
    if feed_location.starts_with("http://") || feed_location.starts_with("https://") {
let feed_url = feed_location;
// Get the podcast feed
let response = minreq::get(feed_url)
// For SquareSpace which refuses requests with no User-Agent
.with_header("User-Agent", "podcast-downloader")
.with_header("Accept", "*/*")
.send()
.context(format!(r#"error when requesting feed url "{}" for {}"#, feed_url, alias))?;
if response.status_code != 200 {
eprintln!(r#"[error] feed "{}" for alias {} responded with non-200 ({}) status code"#, feed_url, alias, response.status_code);
return Ok(());
}
        update_podcast_from_feed(&output, response.as_str()?)
} else {
let feed_path = root.join(feed_location);
match fs::read_to_string(&feed_path) {
Ok(feed) => update_podcast_from_feed(&output, &feed),
Err(err) => {
eprintln!(r#"[error] failed to read path "{}" with error {}"#, feed_path.display(), err);
Ok(())
}
}
}
}
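// Illustrative call with a hypothetical alias, root, and feed URL; a
// `feed_location` that does not start with http(s) is treated as a path
// relative to `root`:
//
//     update_podcast("my-show", std::path::Path::new("/srv/podcasts"), "https://example.com/feed.xml")?;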
/// Extract the file extension from the final path segment of `url`,
/// ignoring any query string or fragment.
fn extract_extension_from_url(url: &str) -> Result<Option<String>, url::ParseError> {
    let parsed = url::Url::parse(url)?;
    // Only inspect the last path segment: splitting the whole URL on '.'
    // would mistake dots in the host (or an earlier segment) for an
    // extension whenever the file name itself has none.
    Ok(parsed
        .path_segments()
        .and_then(|segments| segments.last())
        .and_then(|name| name.rsplit_once('.'))
        .map(|(_, extension)| extension.to_owned()))
}
fn update_artwork<'a, 'b>(
channel: &rss::Channel<'a>,
spec: &mut Specification<'b>,
output: &path::Path,
) -> anyhow::Result<()> where 'a: 'b {
let image_url = match (&channel.image, &channel.itunes_image) {
(Some(image), _) => Some(&image.url),
(_, Some(itunes_image)) => Some(&itunes_image.href),
_ => None,
};
match (&spec.image_url, image_url) {
// They match, so no need to change anything
(Some(old), Some(new)) if old == new => (),
// New and different URL
(_, Some(new)) => {
match extract_extension_from_url(new.as_ref()) {
Ok(Some(extension)) => {
let cover_path = output.join(format!("cover-original.{}", extension));
// Remove cover with conflicting file path if it exists
if cover_path.exists() {
fs::remove_file(&cover_path)?;
}
if let Err(err) = download_to_file(new.as_ref(), &cover_path) {
eprintln!(r#"[error] failed to download artwork with error "{}". skipping"#, err);
}
},
Ok(None) => {
println!(r#"[warning] could not identify file type from url "{}" for podcast artwork "{}". skipping."#, new, channel.title);
}
Err(err) => {
println!(r#"[warning] failed to parse url "{}" for "{}" artwork with error: {}. skipping."#, new, channel.title, err);
},
};
spec.image_url = Some(new.clone());
},
_ => (),
}
Ok(())
}
pub(crate) fn update_podcast_from_feed(
output: &path::Path,
feed: &str,
) -> anyhow::Result<()> {
let feed = match xml_serde::from_str::<rss::Feed>(&feed) {
Ok(feed) => feed,
Err(err) => {
eprintln!(r#"[error] failed to parse rss feed with error: "{}""#, err);
return Ok(())
}
};
let channel = feed.rss.channel;
let spec_file = output.join("spec.toml");
let mut spec = Specification::read_from_with_default(&spec_file)?;
    // Get the set of ids for all currently available episodes so that we can
    // later mark any other episodes as unavailable. The id falls back to the
    // enclosure URL when the GUID is absent, matching the loop below (the
    // previous unconditional unwrap would panic on GUID-less feeds).
    let current_episodes = {
        let mut current_episodes = HashSet::new();
        for episode in &channel.items {
            if let Some(guid) = &episode.guid {
                current_episodes.insert(guid.to_string());
            } else if let Some(enclosure) = &episode.enclosure {
                current_episodes.insert(enclosure.url.to_string());
            }
        }
        current_episodes
    };
update_artwork(
&channel,
&mut spec,
&output,
)?;
for item in channel.items {
        let rss::Item {
            title,
            enclosure,
            description,
            summary,
            guid,
            // Take `published` here too: the destructure consumes `item`,
            // so this is cleaner than reading `item.published` afterwards.
            published,
            ..
        } = item;
let Some(enclosure) = enclosure else {
println!(r#"[warning] episode "{}" does not have an enclosure tag. skipping."#, title);
continue;
};
        // Prefer the description tag, falling back to the summary tag
        let description = description.or(summary);
let guid = guid.as_deref();
let url = enclosure.url.as_ref();
let id = guid.unwrap_or(url);
        match spec.files.get(id) {
            // File already downloaded
            Some(path) => {
                // The file has been deleted by another process but the
                // specification hasn't been updated; just redownload it.
                // This also gives an easy way to force a redownload.
                // Note `path` is stored relative to the podcast folder, so
                // it must be joined with `output` before use rather than
                // resolved against the current directory.
                let target = output.join(path);
                if !target.exists() {
                    println!(r#"[info] redownloading "{}" as the file seems to have been deleted"#, title);
                    if let Err(err) = download_to_file(enclosure.url.as_ref(), &target) {
                        eprintln!(r#"[error] failed to redownload episode with error: "{}". skipping"#, err);
                        continue;
                    }
                }
            },
None => {
let extension = match extract_extension_from_url(enclosure.url.as_ref()) {
Ok(Some(extension)) => extension,
Ok(None) => {
println!(r#"[warning] could not identify file type from url "{}" for episode "{}". skipping."#, url, title);
continue;
}
Err(err) => {
println!(r#"[warning] failed to parse url "{}" for episode "{}" with error: {}. skipping."#, url, title, err);
continue;
},
};
let file_path = if ["mp3", "m4a", "ogg", "wav", "mp4", "m4v", "mov", "aiff"].contains(&&extension.to_lowercase()[..]) {
output.join(format!("{}.{}", sanitise(&title), extension))
} else {
println!("[warning] unsupported file extension: {}. skipping.", extension);
continue;
};
                // The filename may already exist even though the episode has
                // not been downloaded. In that case construct a new filename
                // by appending a parenthesised counter to the stem.
                let file_path = if file_path.exists() {
                    increment_file_name(&file_path).into_owned()
                } else { file_path };
println!(r#"[info] downloading "{}""#, title);
match download_to_file(enclosure.url.as_ref(), &file_path) {
                    Ok(()) => {
                        // `file_path` was built from `output`, so its prefix can
                        // be stripped directly; canonicalising first could
                        // resolve symlinks into a path that no longer starts
                        // with `output` and make `strip_prefix` panic.
                        spec.files.insert(
                            Cow::from(id.to_owned()),
                            Cow::from(file_path.strip_prefix(output).unwrap().to_owned()),
                        );
let episode = Episode {
show_notes: description,
id: Cow::from(id.to_owned()),
current: true,
title,
listened: false,
};
                        spec.feed.entry(published).or_default().push(episode);
                        // Persist the specification after each successful
                        // download so progress is not lost if a later one fails
spec.write_to(&spec_file)?;
},
Err(err) => {
eprintln!(r#"[error] failed to request episode "{}" with error: "{}". skipping"#, title, err);
continue;
}
}
},
}
}
let mut feed_change = false;
    // Mark episodes which have been removed from the feed as no longer current
    for episodes in spec.feed.values_mut() {
        for episode in episodes {
            if !current_episodes.contains(episode.id.as_ref()) {
                episode.current = false;
                feed_change = true;
            }
        }
    }
if feed_change {
spec.write_to(&spec_file)?;
}
Ok(())
}
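// Illustrative use with a saved feed snapshot (paths hypothetical):
//
//     let feed = std::fs::read_to_string("feeds/my-show.xml")?;
//     update_podcast_from_feed(std::path::Path::new("podcasts/my-show"), &feed)?;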
/// Given a file path `something.xyz`, returns `something.xyz` itself if it
/// does not exist, otherwise the first path of the form `something(a).xyz`
/// (for a non-negative integer `a`) that does not exist.
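/// For example (illustrative): with `episode.mp3` and `episode(0).mp3`
/// already present on disk, passing `episode.mp3` returns `episode(1).mp3`.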
fn increment_file_name(path: &path::Path) -> Cow<'_, path::Path> {
    if path.exists() {
        let mut new_path = path.to_owned();
        let mut i: u32 = 0;
        while new_path.exists() {
            // Build the whole file name in one go: calling `set_extension`
            // after `set_file_name` would misparse a stem that itself
            // contains a dot (e.g. `my.episode(0)` would become `my.xyz`).
            let mut file_name = path.file_stem().unwrap().to_owned();
            file_name.push(format!("({})", i));
            if let Some(extension) = path.extension() {
                file_name.push(".");
                file_name.push(extension);
            }
            new_path.set_file_name(file_name);
            i += 1;
        }
        Cow::from(new_path)
    } else {
        // The non-colliding case is by far the most common and the cheapest
        // to handle, so check for it first and avoid the extra allocation.
        Cow::from(path)
    }
}