initial commit
This commit is contained in:
352
src/download.rs
Normal file
352
src/download.rs
Normal file
@ -0,0 +1,352 @@
|
||||
use std::fs;
|
||||
use std::path;
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{HashMap, BTreeMap, HashSet};
|
||||
|
||||
use anyhow::Context;
|
||||
use sanitise_file_name::sanitise;
|
||||
|
||||
use crate::rss;
|
||||
|
||||
#[derive(Default, serde::Serialize, serde::Deserialize)]
|
||||
struct Specification<'a> {
|
||||
files : HashMap<Cow<'a, str>, Cow<'a, path::Path>>,
|
||||
feed : BTreeMap<chrono::NaiveDateTime, Vec<Episode<'a>>>,
|
||||
image_url : Option<Cow<'a, str>>,
|
||||
}
|
||||
|
||||
impl<'a> Specification<'a> {
|
||||
fn read_from(path : &path::Path) -> Result<Self, anyhow::Error> {
|
||||
Ok(if path.is_file() {
|
||||
toml::from_str(&fs::read_to_string(&path)?[..])?
|
||||
} else {
|
||||
Specification::default()
|
||||
})
|
||||
}
|
||||
|
||||
fn write_to(&self, path : &path::Path) -> Result<(), anyhow::Error> {
|
||||
Ok(fs::write(path, toml::to_string(self)?.as_bytes())?)
|
||||
}
|
||||
}
|
||||
|
||||
/// A single feed entry as recorded in the on-disk specification.
#[derive(serde::Serialize, serde::Deserialize)]
struct Episode<'a> {
    /// Episode title.
    title : Cow<'a, str>,
    /// Show notes pulled from description or summary tag.
    show_notes : Option<Cow<'a, str>>,
    /// This is the GUID or the URL if the GUID is not present.
    id : Cow<'a, str>,
    /// If the episode exists in the latest version of the feed.
    current : bool,
}
|
||||
|
||||
fn download_to_file(url : &str, path : &path::Path) -> anyhow::Result<()> {
|
||||
let response = minreq::get(url)
|
||||
.send()?;
|
||||
|
||||
if response.status_code == 200 {
|
||||
fs::write(&path, response.as_bytes())?;
|
||||
} else {
|
||||
anyhow::bail!("request for episode resulted in non 200 ({}) response code", response.status_code)
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub (crate) fn update_podcast(
|
||||
alias : &str,
|
||||
root : &path::Path,
|
||||
feed_location : &str,
|
||||
) -> anyhow::Result<()> {
|
||||
|
||||
// Create output directory
|
||||
let output = root.join(sanitise(&alias));
|
||||
if !output.exists() {
|
||||
fs::create_dir(&output)
|
||||
.with_context(|| format!("failed to create output directory for podcast {}", alias))?;
|
||||
}
|
||||
|
||||
println!(r#"info: scanning feed for "{}""#, alias);
|
||||
|
||||
if feed_location.starts_with("http") {
|
||||
let feed_url = feed_location;
|
||||
|
||||
// Get the podcast feed
|
||||
let response = minreq::get(feed_url)
|
||||
// For SquareSpace which refuses requests with no User-Agent
|
||||
.with_header("User-Agent", "podcast-downloader")
|
||||
.with_header("Accept", "*/*")
|
||||
.send()
|
||||
.with_context(|| format!(r#"error when requesting feed url "{}" for {}"#, feed_url, alias))?;
|
||||
|
||||
if response.status_code != 200 {
|
||||
eprintln!(r#"error: feed "{}" for alias {} responded with non-200 ({}) status code"#, feed_url, alias, response.status_code);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let feed = response.as_str()?.to_owned();
|
||||
update_podcast_from_feed(&output, &feed)
|
||||
} else {
|
||||
let feed_path = root.join(feed_location);
|
||||
|
||||
match fs::read_to_string(&feed_path) {
|
||||
Ok(feed) => update_podcast_from_feed(&output, &feed),
|
||||
Err(err) => {
|
||||
eprintln!(r#"error: failed to read path "{}" with error {}"#, feed_path.display(), err);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the file extension from the path component of `url`.
///
/// Only the final path segment is examined. Splitting the whole URL on '.'
/// would mistake the dot in the host name for an extension separator when the
/// path itself has none (e.g. "https://example.com/ep" would yield "com/ep");
/// the query and fragment are not part of the path, so they are ignored
/// automatically.
///
/// Returns `Ok(None)` when no usable extension is present.
fn extract_extension_from_url(url : &str) -> Result<Option<String>, url::ParseError> {
    let parsed = url::Url::parse(url)?;

    let last_segment = parsed
        .path_segments()
        .and_then(|segments| segments.last())
        .unwrap_or("");

    match last_segment.rsplit_once('.') {
        // An empty stem (".mp3") or empty extension ("episode.") does not
        // identify a file type
        Some((stem, extension)) if !stem.is_empty() && !extension.is_empty() =>
            Ok(Some(extension.to_owned())),
        _ => Ok(None),
    }
}
|
||||
|
||||
fn update_artwork<'a, 'b>(
|
||||
channel : &rss::Channel<'a>,
|
||||
spec : &mut Specification<'b>,
|
||||
output : &path::Path,
|
||||
) -> anyhow::Result<()> where 'a : 'b {
|
||||
|
||||
let image_url = match (&channel.image, &channel.itunes_image) {
|
||||
(Some(image), _) => Some(&image.url),
|
||||
(_, Some(itunes_image)) => Some(&itunes_image.href),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
match (&spec.image_url, image_url) {
|
||||
// They match, so no need to change anything
|
||||
(Some(old), Some(new)) if old == new => (),
|
||||
// New and different URL
|
||||
(_, Some(new)) => {
|
||||
|
||||
match extract_extension_from_url(new.as_ref()) {
|
||||
Ok(Some(extension)) => {
|
||||
let cover_path = output.join(format!("cover.{}", extension));
|
||||
|
||||
// Remove cover with conflicting file path if it exists
|
||||
if cover_path.exists() {
|
||||
fs::remove_file(&cover_path)?;
|
||||
}
|
||||
|
||||
if let Err(err) = download_to_file(new.as_ref(), &cover_path) {
|
||||
eprintln!(r#"error: failed to download artwork with error "{}". skipping"#, err);
|
||||
}
|
||||
},
|
||||
Ok(None) => {
|
||||
println!(r#"warning: could not identify file type from url "{}" for podcast artwork "{}". skipping."#, new, channel.title);
|
||||
}
|
||||
Err(err) => {
|
||||
println!(r#"warning: failed to parse url "{}" for "{}" artwork with error: {}. skipping."#, new, channel.title, err);
|
||||
},
|
||||
};
|
||||
|
||||
spec.image_url = Some(new.clone());
|
||||
},
|
||||
_ => (),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
pub (crate) fn update_podcast_from_feed(
|
||||
output : &path::Path,
|
||||
feed : &str,
|
||||
) -> anyhow::Result<()> {
|
||||
|
||||
let feed = match xml_serde::from_str::<rss::Feed>(&feed) {
|
||||
Ok(feed) => feed,
|
||||
Err(err) => {
|
||||
eprintln!(r#"error: failed to parse rss feed with error: "{}""#, err);
|
||||
return Ok(())
|
||||
}
|
||||
};
|
||||
|
||||
let channel = feed.rss.channel;
|
||||
|
||||
let spec_file = output.join("spec.toml");
|
||||
|
||||
let mut spec = Specification::read_from(&spec_file)?;
|
||||
|
||||
// Get set of all currently available episodes such that we can later mark
|
||||
// any other episodes as unavailable
|
||||
let current_episodes = {
|
||||
let mut current_episodes = HashSet::new();
|
||||
for episode in &channel.items {
|
||||
let guid = episode.guid.clone().unwrap();
|
||||
current_episodes.insert(guid);
|
||||
}
|
||||
current_episodes
|
||||
};
|
||||
|
||||
update_artwork(
|
||||
&channel,
|
||||
&mut spec,
|
||||
&output,
|
||||
)?;
|
||||
|
||||
for item in channel.items {
|
||||
|
||||
let rss::Item {
|
||||
title,
|
||||
enclosure,
|
||||
description,
|
||||
summary,
|
||||
guid,
|
||||
..
|
||||
} = item;
|
||||
|
||||
let Some(enclosure) = enclosure else {
|
||||
println!(r#"warning: episode "{}" does not have an enclosure tag. skipping."#, title);
|
||||
continue;
|
||||
};
|
||||
|
||||
let description = match (description, summary) {
|
||||
(Some(a), _) => Some(a),
|
||||
(_, Some(a)) => Some(a),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let guid = guid.as_deref();
|
||||
let url = enclosure.url.as_ref();
|
||||
|
||||
let id = guid.unwrap_or(url);
|
||||
|
||||
match spec.files.get(id) {
|
||||
// File already downloaded
|
||||
Some(path) => {
|
||||
// File has been deleted by another process but the specification hasn't been updated
|
||||
// In this case we just redownload the file
|
||||
// This gives an easy way to force a redownload
|
||||
if !output.join(path).exists() {
|
||||
println!(r#"info: redownloading "{}" as the file seems to have been deleted"#, title);
|
||||
if let Err(err) = download_to_file(enclosure.url.as_ref(), path) {
|
||||
eprintln!(r#"error: failed to redownload new episode with error "{}". skipping"#, err);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
},
|
||||
None => {
|
||||
|
||||
let extension = match extract_extension_from_url(enclosure.url.as_ref()) {
|
||||
Ok(Some(extension)) => extension,
|
||||
Ok(None) => {
|
||||
println!(r#"warning: could not identify file type from url "{}" for episode "{}". skipping."#, url, title);
|
||||
continue;
|
||||
}
|
||||
Err(err) => {
|
||||
println!(r#"warning: failed to parse url "{}" for episode "{}" with error: {}. skipping."#, url, title, err);
|
||||
continue;
|
||||
},
|
||||
};
|
||||
|
||||
let file_path = if ["mp3", "m4a", "ogg", "wav", "mp4", "m4v", "mov", "aiff"].contains(&&extension.to_lowercase()[..]) {
|
||||
output.join(format!("{}.{}", sanitise(&title), extension))
|
||||
} else {
|
||||
println!("warning: unsupported file extension: {}. skipping.", extension);
|
||||
continue;
|
||||
};
|
||||
|
||||
// The filename happens to exist despite the episode not being downloaded.
|
||||
// In this case we need to construct a new filename by appending a digit to the end
|
||||
let file_path = if file_path.exists() {
|
||||
increment_file_name(&file_path).into_owned()
|
||||
} else { file_path };
|
||||
|
||||
println!(r#"info: downloading "{}" to "{}""#, title, file_path.display());
|
||||
|
||||
match download_to_file(enclosure.url.as_ref(), &file_path) {
|
||||
Ok(()) => {
|
||||
let file_path = file_path.canonicalize().unwrap();
|
||||
|
||||
spec.files.insert(
|
||||
Cow::from(id.to_owned()),
|
||||
Cow::from(file_path.strip_prefix(&output).unwrap().to_owned()),
|
||||
);
|
||||
|
||||
let episode = Episode {
|
||||
show_notes : description,
|
||||
id : Cow::from(id.to_owned()),
|
||||
current : true,
|
||||
title,
|
||||
};
|
||||
|
||||
match spec.feed.get_mut(&item.published) {
|
||||
Some(existing) => {
|
||||
existing.push(episode)
|
||||
},
|
||||
None => {
|
||||
spec.feed.insert(
|
||||
item.published,
|
||||
vec![episode],
|
||||
);
|
||||
}
|
||||
}
|
||||
},
|
||||
Err(err) => {
|
||||
eprintln!(r#"error: failed to request episode "{}" with error "{}". skipping"#, title, err);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
spec.write_to(&spec_file)?;
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Setting episodes which have been removed to no longer be current
|
||||
for (_, episodes) in &mut spec.feed {
|
||||
for episode in episodes {
|
||||
if !current_episodes.contains(episode.id.as_ref()) {
|
||||
episode.current = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
spec.write_to(&spec_file)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Given a file path `something.xyz`, returns the first path of the form
/// `something(a).xyz` where `a` is a non-negative integer which does not
/// currently exist, or `something.xyz` if it itself does not exist.
fn increment_file_name(path : &path::Path) -> Cow<'_, path::Path> {
    // Fast path: by far the most common case is that the name is free, and
    // returning a borrow avoids allocating a new PathBuf
    if !path.exists() {
        return Cow::from(path);
    }

    // The stem and extension come from the original path every time, so they
    // can be computed once up front
    let stem = path.file_stem().unwrap().to_owned();
    let extension = path.extension();

    let mut candidate = path.to_owned();
    let mut counter : u32 = 0;

    // Try `stem(0).ext`, `stem(1).ext`, ... until an unused name turns up
    while candidate.exists() {
        let mut name = stem.clone();
        name.push(format!("({})", counter));

        candidate.set_file_name(name);
        if let Some(extension) = extension {
            candidate.set_extension(extension);
        }

        counter += 1;
    }

    Cow::from(candidate)
}
|
||||
|
20
src/input.rs
Normal file
20
src/input.rs
Normal file
@ -0,0 +1,20 @@
|
||||
use std::path;
|
||||
use std::collections::HashMap;
|
||||
|
||||
// Command line interface definition. clap derives the parser; note the field
// doc comments double as per-flag help text, so they are user-facing.
#[derive(clap::Parser)]
pub (crate) struct Args {
    /// Path to the configuration file listing podcast RSS feeds.
    #[arg(default_value = "./podcasts.toml")]
    pub (crate) config : path::PathBuf,
    /// The podcast to update. Updates all in configuration file if unspecified.
    #[arg(long, short)]
    pub (crate) podcast : Option<String>,
}
|
||||
|
||||
/// Struct modelling configuration file format.
///
/// Example:
/// ```toml
/// [podcasts]
/// my-show = "https://example.com/feed.xml"
/// local-show = "feeds/local.xml"
/// ```
#[derive(serde::Deserialize)]
pub (crate) struct Config {
    /// Map from podcast alias to RSS feed either as a url (prefix: http) or file path.
    pub (crate) podcasts : HashMap<String, String>,
}
|
||||
|
47
src/main.rs
Normal file
47
src/main.rs
Normal file
@ -0,0 +1,47 @@
|
||||
mod rss;
|
||||
mod input;
|
||||
mod download;
|
||||
|
||||
use std::fs;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
|
||||
let args = {
|
||||
use clap::Parser;
|
||||
input::Args::parse()
|
||||
};
|
||||
|
||||
let config : input::Config = {
|
||||
let config = fs::read_to_string(&args.config)
|
||||
.with_context(|| "failed to read in podcast configuration file")?;
|
||||
|
||||
toml::from_str(&config[..])?
|
||||
};
|
||||
|
||||
let config_path = args.config.canonicalize()?;
|
||||
let Some(root) = config_path.parent() else {
|
||||
anyhow::bail!("could not get parent of configuration path for root directory")
|
||||
};
|
||||
|
||||
// Updating single podcast
|
||||
if let Some(alias) = args.podcast {
|
||||
if let Some(feed_url) = config.podcasts.get(&alias) {
|
||||
download::update_podcast(&alias, root, feed_url)?;
|
||||
}
|
||||
else {
|
||||
anyhow::bail!(r#"podcast "{}" not found in configuration file"#, alias)
|
||||
}
|
||||
}
|
||||
// Updating all podcasts
|
||||
else {
|
||||
for (alias, feed_url) in config.podcasts {
|
||||
download::update_podcast(&alias, root, &feed_url)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
91
src/rss.rs
Normal file
91
src/rss.rs
Normal file
@ -0,0 +1,91 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
// See https://support.google.com/podcast-publishers/answer/9889544
|
||||
// for some reasonably guidelines for how a podcast RSS feed should look.
|
||||
|
||||
use std::fmt;
|
||||
use std::borrow::Cow;
|
||||
|
||||
/// Root of a deserialised RSS document (wrapper around the `<rss>` element).
#[derive(Debug, serde::Deserialize)]
pub struct Feed<'a> {
    pub (crate) rss : Rss<'a>,
}
|
||||
|
||||
/// The `<rss>` element; holds the single `<channel>`.
#[derive(Debug, serde::Deserialize)]
pub struct Rss<'a> {
    pub (crate) channel : Channel<'a>,
}
|
||||
|
||||
/// The `<channel>` element: podcast-level metadata plus its list of items.
#[derive(Debug, serde::Deserialize)]
pub struct Channel<'a> {
    /// Episode entries (`<item>` elements); defaults to empty when absent.
    #[serde(rename = "item", default)]
    pub (crate) items : Vec<Item<'a>>,
    /// Website link for the podcast.
    pub (crate) link : Cow<'a, str>,
    /// Podcast title.
    pub (crate) title : Cow<'a, str>,
    /// Free-form channel description.
    pub (crate) description : Option<Cow<'a, str>>,
    /// iTunes author tag.
    #[serde(rename = "{http://www.itunes.com/dtds/podcast-1.0.dtd}itunes:author")]
    pub (crate) author : Option<Cow<'a, str>>,
    /// iTunes summary tag.
    #[serde(rename = "{http://www.itunes.com/dtds/podcast-1.0.dtd}itunes:summary")]
    pub (crate) summary : Option<Cow<'a, str>>,
    /// iTunes artwork tag (used for artwork when `image` is absent).
    #[serde(rename = "{http://www.itunes.com/dtds/podcast-1.0.dtd}itunes:image")]
    pub (crate) itunes_image : Option<ItunesImage<'a>>,
    /// Standard RSS `<image>` tag.
    pub (crate) image : Option<Image<'a>>,
}
|
||||
|
||||
/// The standard RSS `<image>` element describing channel artwork.
#[derive(Debug, serde::Deserialize)]
pub struct Image<'a> {
    pub (crate) link : Cow<'a, str>,
    pub (crate) title : Cow<'a, str>,
    /// URL of the artwork file itself.
    pub (crate) url : Cow<'a, str>,
}
|
||||
|
||||
/// The `<itunes:image>` element; its artwork URL is carried in the `href`
/// attribute rather than element text.
#[derive(Debug, serde::Deserialize)]
pub struct ItunesImage<'a> {
    #[serde(rename = "$attr:href")]
    pub (crate) href : Cow<'a, str>,
}
|
||||
|
||||
fn deserialize_publish_date<'de, D : serde::de::Deserializer<'de>> (
|
||||
deserializer : D
|
||||
) -> Result<chrono::NaiveDateTime, D::Error> {
|
||||
struct Visitor;
|
||||
impl<'de> serde::de::Visitor<'de> for Visitor {
|
||||
type Value = chrono::NaiveDateTime;
|
||||
|
||||
fn expecting(&self, formatter : &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("a string containing json data")
|
||||
}
|
||||
|
||||
fn visit_str<E : serde::de::Error>(self, input : &str) -> Result<Self::Value, E> {
|
||||
chrono::NaiveDateTime::parse_from_str(
|
||||
input,
|
||||
"%a, %d %b %Y %H:%M:%S %Z"
|
||||
).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_any(Visitor)
|
||||
}
|
||||
|
||||
/// A single `<item>` (episode) in the feed.
#[derive(Debug, serde::Deserialize)]
pub struct Item<'a> {
    /// Episode title.
    pub (crate) title : Cow<'a, str>,
    /// Media attachment; absent for items with no downloadable file.
    pub (crate) enclosure : Option<Enclosure<'a>>,
    /// Show notes.
    pub (crate) description : Option<Cow<'a, str>>,
    /// iTunes summary, an alternative source of show notes.
    #[serde(rename = "{http://www.itunes.com/dtds/podcast-1.0.dtd}itunes:summary")]
    pub (crate) summary : Option<Cow<'a, str>>,
    /// Publication date, parsed from the RFC 2822 style `pubDate` tag.
    #[serde(rename = "pubDate", deserialize_with = "deserialize_publish_date")]
    pub (crate) published : chrono::NaiveDateTime,
    /// Episode GUID; optional in RSS, so consumers must handle its absence.
    pub (crate) guid : Option<Cow<'a, str>>,
}
|
||||
|
||||
/// The `<enclosure>` element describing an item's media file; all of its
/// data is carried in attributes.
#[derive(Debug, serde::Deserialize)]
pub struct Enclosure<'a> {
    /// Direct URL of the media file.
    #[serde(rename = "$attr:url")]
    pub (crate) url : Cow<'a, str>,
    /// MIME type of the file, e.g. "audio/mpeg".
    #[serde(rename = "$attr:type")]
    pub (crate) mime_type : Option<Cow<'a, str>>,
    /// File size in bytes, per the RSS enclosure specification.
    #[serde(rename = "$attr:length")]
    pub (crate) length : Option<u64>,
}
|
||||
|
Reference in New Issue
Block a user