i know wpflare likes being touched by a cf bypass, thats why they dont pull thier panties back up(go on im under attack mode)
This commit is contained in:
parent
6b57991e6c
commit
a4f0265fdc
520
src/main.rs
Normal file
520
src/main.rs
Normal file
|
|
@ -0,0 +1,520 @@
|
||||||
|
mod wallpapersclan;
|
||||||
|
mod wallpaperflare;
|
||||||
|
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
const CDN_BASE: &str = "https://raw.githubusercontent.com/yapude/wallpapers/main/assets";
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::{AtomicU32, Ordering};
|
||||||
|
use tokio::sync::{Mutex, Semaphore};
|
||||||
|
|
||||||
|
/// Run-wide counters that every scraping task can bump through a shared
/// reference; each field is an atomic, so no lock is needed to update them.
#[derive(Default)]
struct Stats {
    /// Wallpapers whose download task finished successfully.
    downloaded: AtomicU32,
    /// Items skipped because their id was already known.
    skipped: AtomicU32,
    /// Download tasks that ended in an error.
    failed: AtomicU32,
    /// Successful `git push` batches.
    pushed: AtomicU32,
}

impl Stats {
    /// Returns a counter set with every field starting at zero.
    fn new() -> Self {
        Self::default()
    }
}
|
||||||
|
|
||||||
|
/// Summarises how much space the regular files directly inside `assets/`
/// occupy, as `"<whole MiB>MB across <count> files"`.
///
/// Sub-directories are not descended into, and entries whose metadata cannot
/// be read are ignored. An unreadable or missing `assets/` directory yields
/// `"0MB across 0 files"`.
fn get_disk_usage() -> String {
    const BYTES_PER_MB: u64 = 1024 * 1024;

    let (byte_total, files): (u64, u64) = std::fs::read_dir(Path::new("assets"))
        .into_iter()
        // Result<ReadDir> -> the directory iterator, if it could be opened
        .flatten()
        // Result<DirEntry> -> only the entries that could be read
        .flatten()
        .filter_map(|entry| entry.metadata().ok())
        .filter(|meta| meta.is_file())
        .fold((0, 0), |(bytes, count), meta| (bytes + meta.len(), count + 1));

    // integer division: partial megabytes are truncated, not rounded
    format!("{}MB across {} files", byte_total / BYTES_PER_MB, files)
}
|
||||||
|
|
||||||
|
/// Returns the number of lines in `md_file`, or `0` when the file cannot be
/// read as UTF-8 text (missing, unreadable, invalid encoding).
fn get_readme_lines(md_file: &str) -> usize {
    match std::fs::read_to_string(md_file) {
        Ok(text) => text.lines().count(),
        Err(_) => 0,
    }
}
|
||||||
|
|
||||||
|
// print a compact stats dashboard
|
||||||
|
fn print_stats(stats: &Stats, md_file: &str) {
|
||||||
|
let dl = stats.downloaded.load(Ordering::Relaxed);
|
||||||
|
let skip = stats.skipped.load(Ordering::Relaxed);
|
||||||
|
let fail = stats.failed.load(Ordering::Relaxed);
|
||||||
|
let pushes = stats.pushed.load(Ordering::Relaxed);
|
||||||
|
let readme_lines = get_readme_lines(md_file);
|
||||||
|
let disk = get_disk_usage();
|
||||||
|
println!(
|
||||||
|
"[stats] downloaded: {} | skipped: {} | failed: {} | pushes: {} | readme: {} lines | local disk: {}",
|
||||||
|
dl, skip, fail, pushes, readme_lines, disk
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
println!("=== site-archive scraper ===");
|
||||||
|
|
||||||
|
// Global limits and locks
|
||||||
|
let dl_semaphore = Arc::new(Semaphore::new(30)); // max 30 concurrent downloads across all tags
|
||||||
|
let md_mutex = Arc::new(Mutex::new(()));
|
||||||
|
let unpushed_count = Arc::new(Mutex::new(0u32));
|
||||||
|
let stats = Arc::new(Stats::new());
|
||||||
|
/* tag storage
|
||||||
|
"anime",
|
||||||
|
"genshin impact",
|
||||||
|
"wuthering waves",
|
||||||
|
"artwork",
|
||||||
|
"space",
|
||||||
|
"anime sexy",
|
||||||
|
"blue archive",
|
||||||
|
"video games",
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
|
||||||
|
*/
|
||||||
|
// scrape wallpaperflare with specific tags
|
||||||
|
let flare_tags = vec![
|
||||||
|
"night",
|
||||||
|
"graphics",
|
||||||
|
"city",
|
||||||
|
"architecture",
|
||||||
|
"landscape",
|
||||||
|
"nature",
|
||||||
|
"space",
|
||||||
|
"fantasy art",
|
||||||
|
"honkai star rail",
|
||||||
|
"zenless zone zero",
|
||||||
|
"arknights",
|
||||||
|
"artistic",
|
||||||
|
"water",
|
||||||
|
"sky",
|
||||||
|
"river",
|
||||||
|
"art",
|
||||||
|
"trees",
|
||||||
|
"minecraft",
|
||||||
|
"painting",
|
||||||
|
"clouds",
|
||||||
|
"beauty in nature",
|
||||||
|
"tree",
|
||||||
|
"plant",
|
||||||
|
"scenics - nature",
|
||||||
|
"oil on canvas",
|
||||||
|
"tranquility",
|
||||||
|
"outside",
|
||||||
|
"tranquil scene",
|
||||||
|
"country",
|
||||||
|
"countryside",
|
||||||
|
"day",
|
||||||
|
"land",
|
||||||
|
"forest",
|
||||||
|
"cloud - sky",
|
||||||
|
"mountains",
|
||||||
|
"mountain",
|
||||||
|
"artistry",
|
||||||
|
"reflections",
|
||||||
|
"lake",
|
||||||
|
"scenic",
|
||||||
|
"non-urban scene",
|
||||||
|
"environment",
|
||||||
|
"people",
|
||||||
|
"loli",
|
||||||
|
"anime girls",
|
||||||
|
"ecchi",
|
||||||
|
"school uniform",
|
||||||
|
"Houkai Gakuen",
|
||||||
|
"Kiana Kaslana",
|
||||||
|
"thigh-highs",
|
||||||
|
"skirt",
|
||||||
|
"artwork",
|
||||||
|
"weapon",
|
||||||
|
"anime",
|
||||||
|
"Honkai",
|
||||||
|
"backgrounds",
|
||||||
|
"computer Graphic",
|
||||||
|
"technology",
|
||||||
|
"futuristic",
|
||||||
|
"vector",
|
||||||
|
"illustration",
|
||||||
|
"men",
|
||||||
|
"fantasy",
|
||||||
|
"astronomy",
|
||||||
|
"abstract",
|
||||||
|
"representation",
|
||||||
|
"indoors",
|
||||||
|
"still life",
|
||||||
|
"art and craft",
|
||||||
|
"no people",
|
||||||
|
"high angle view",
|
||||||
|
"creativity",
|
||||||
|
"human representation",
|
||||||
|
"celebration",
|
||||||
|
"table",
|
||||||
|
"multi colored",
|
||||||
|
"confetti",
|
||||||
|
"decoration",
|
||||||
|
"toy",
|
||||||
|
"close-up",
|
||||||
|
"large group of objects",
|
||||||
|
"craft",
|
||||||
|
"white",
|
||||||
|
"haired",
|
||||||
|
"female",
|
||||||
|
"character",
|
||||||
|
"manga",
|
||||||
|
"fan art",
|
||||||
|
"minimalism",
|
||||||
|
"monochrome",
|
||||||
|
"dark background",
|
||||||
|
"pantsu shot",
|
||||||
|
"uniform",
|
||||||
|
"selective coloring",
|
||||||
|
"ecchi",
|
||||||
|
"Tanaka Kotoha",
|
||||||
|
"gyorui",
|
||||||
|
"katsuwo drawing",
|
||||||
|
"map",
|
||||||
|
"thighs",
|
||||||
|
"science fiction",
|
||||||
|
"sunset",
|
||||||
|
"walking",
|
||||||
|
"woman",
|
||||||
|
"street",
|
||||||
|
"lantern",
|
||||||
|
];
|
||||||
|
|
||||||
|
let shared_client = match wallpaperflare::build_client() {
|
||||||
|
Ok(c) => Arc::new(c),
|
||||||
|
Err(e) => {
|
||||||
|
println!("failed to build client: {}", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut tasks = Vec::new();
|
||||||
|
for tag in flare_tags {
|
||||||
|
let sem = dl_semaphore.clone();
|
||||||
|
let mtx = md_mutex.clone();
|
||||||
|
let u_count = unpushed_count.clone();
|
||||||
|
let s = stats.clone();
|
||||||
|
let tag = tag.to_string();
|
||||||
|
let client = shared_client.clone();
|
||||||
|
tasks.push(tokio::spawn(async move {
|
||||||
|
scrape_source(client, "assets", "README.md", Some(&tag), u32::MAX, sem, mtx, u_count, s).await;
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for all tag scraping tasks to finish
|
||||||
|
futures::future::join_all(tasks).await;
|
||||||
|
|
||||||
|
if std::env::var("GITHUB_ACTIONS").is_ok() {
|
||||||
|
let _ = std::fs::remove_file(".git/index.lock");
|
||||||
|
let _ = tokio::process::Command::new("git").args(["add", "--ignore-removal", "--sparse", "README.md", "assets"])
|
||||||
|
.stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await;
|
||||||
|
let _ = tokio::process::Command::new("git").args(["commit", "-m", "chore: sort readme alphabetically [skip ci]"])
|
||||||
|
.stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await;
|
||||||
|
// let _ = tokio::process::Command::new("git").args(["push"])
|
||||||
|
// .stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await;
|
||||||
|
let _ = tokio::process::Command::new("git").args(["-c", "http.postBuffer=524288000", "push"]).status().await;
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("=== all scraping complete! ===");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scrapes one wallpaperflare listing (a search query, or the front page when
/// `search_query` is `None`) page by page, downloads every item not seen yet,
/// and appends a table row per downloaded image to `md_file`.
///
/// * `client` – shared HTTP client used for listing and download requests.
/// * `source_name` – output directory for images and `.json` manifests; also
///   the path segment `load_existing_ids` looks for in `md_file`.
/// * `md_file` – markdown file holding the gallery table.
/// * `search_query` – tag to search for; `None` is labelled `"all"` in logs.
/// * `max_pages` – last page number to visit (inclusive).
/// * `dl_semaphore` – global cap on concurrent downloads.
/// * `md_mutex` – serialises every read/modify/write of `md_file`.
/// * `unpushed_count` – images appended since the last git batch.
/// * `stats` – run-wide counters.
///
/// The loop stops when a page yields no items, when `max_pages` is passed, or
/// after five consecutive pages fail even with retries.
async fn scrape_source(
    client: Arc<wreq::Client>,
    source_name: &str,
    md_file: &str,
    search_query: Option<&str>,
    max_pages: u32,
    dl_semaphore: Arc<Semaphore>,
    md_mutex: Arc<Mutex<()>>,
    unpushed_count: Arc<Mutex<u32>>,
    stats: Arc<Stats>
) {
    let tag_label = search_query.unwrap_or("all");
    let output_dir = Path::new(source_name);
    if !output_dir.exists() {
        // best effort: a failure here is ignored and surfaces later as write errors
        std::fs::create_dir_all(output_dir).unwrap_or(());
    }

    // Snapshot of the ids already listed in the readme. It is private to this
    // call, so ids added later by other concurrent calls are not visible here.
    let mut existing_ids = {
        let _lock = md_mutex.lock().await;
        load_existing_ids(source_name, md_file)
    };

    // Make sure the readme exists and contains the table header.
    {
        let _lock = md_mutex.lock().await;
        let header = "# Wallpaper Archive\n\nAutomated archive of wallpapers to bypass Cloudflare and prevent dead links.\n\n## Gallery\n\n| Preview | Title | Tags |\n| --- | --- | --- |\n";
        if !Path::new(md_file).exists() {
            let _ = std::fs::write(md_file, header);
        } else {
            // DANGER: never use unwrap_or_default() on this read. If
            // read_to_string fails, the default "" would make the rewrite below
            // replace the whole (potentially huge) file with just the header.
            if let Ok(content) = std::fs::read_to_string(md_file) {
                if !content.contains("| --- | --- | --- |") {
                    let _ = std::fs::write(md_file, format!("{}{}", header, content));
                }
            } else {
                println!("[warn] failed to read {} to check header, skipping injection", md_file);
            }
        }
    }

    let mut total_downloaded = 0u32;
    let mut total_failed = 0u32;
    let mut page = 1u32;
    let mut consecutive_errors = 0u32;
    let max_retries = 3u32;

    loop {
        if page > max_pages {
            break;
        }

        // Fetch the listing page, retrying with a linearly growing delay
        // (5s, then 10s) before giving up on the third failure.
        let mut attempt = 0;
        let result = loop {
            attempt += 1;
            let scrape_res = wallpaperflare::scrape_wallpaperflare(&client, 12, page, search_query).await;

            match scrape_res {
                Ok(items) => break Ok(items),
                Err(e) => {
                    if attempt >= max_retries {
                        break Err(e);
                    }
                    let wait = attempt * 5;
                    tokio::time::sleep(std::time::Duration::from_secs(wait as u64)).await;
                }
            }
        };

        match result {
            Ok(items) => {
                consecutive_errors = 0;

                // NOTE(review): scrape_wallpaperflare returns Err when a page has
                // no items, so an exhausted tag appears to reach the Err arm
                // (with retries) rather than this branch — confirm.
                if items.is_empty() {
                    break;
                }
                let mut page_downloaded = 0;
                let mut new_readme_rows = String::new();

                let mut download_tasks = Vec::new();
                for item in items {
                    let slug = item.id.clone();
                    if existing_ids.contains(&slug) {
                        stats.skipped.fetch_add(1, Ordering::Relaxed);
                        continue;
                    }
                    // recorded before the download runs, so a failed download is
                    // not retried later within this call
                    existing_ids.insert(slug.clone());

                    let output_dir = output_dir.to_path_buf();
                    let max_retries = max_retries;
                    let sem = dl_semaphore.clone();
                    let client = client.clone();

                    // One task per new item; each holds a semaphore permit for
                    // its whole lifetime.
                    download_tasks.push(tokio::spawn(async move {
                        let _permit = sem.acquire().await.unwrap();
                        // extension is guessed from the url; anything without ".png" becomes jpg
                        let ext = if item.download_url.contains(".png") { "png" } else { "jpg" };
                        let filename = format!("{}.{}", slug, ext);
                        let filepath = output_dir.join(&filename);

                        // an existing file is reported as a success with 0 bytes
                        if filepath.exists() {
                            return Ok((slug, ext, item, filename, 0));
                        }

                        // write a json manifest next to the image (best effort)
                        let manifest_path = output_dir.join(format!("{}.json", slug));
                        if let Ok(json) = serde_json::to_string_pretty(&item) {
                            let _ = std::fs::write(&manifest_path, json);
                        }

                        // silent download — stats are printed per batch

                        for dl_attempt in 1..=max_retries {
                            let dl_res = wallpaperflare::download_wallpaper(&client, &item.download_url, &filepath).await;

                            match dl_res {
                                Ok(bytes) => return Ok((slug, ext, item, filename, bytes)),
                                Err(e) => {
                                    // Permanent errors are recognised by message text:
                                    // size rejections and local write failures are not
                                    // retried. The manifest is removed again.
                                    if e.contains("too large") || e.contains("write failed") {
                                        let _ = std::fs::remove_file(&manifest_path);
                                        return Err(());
                                    }
                                    if dl_attempt < max_retries {
                                        tokio::time::sleep(std::time::Duration::from_secs(3)).await;
                                    } else {
                                        let _ = std::fs::remove_file(&manifest_path);
                                        return Err(());
                                    }
                                }
                            }
                        }
                        Err(())
                    }));
                }

                let results = futures::future::join_all(download_tasks).await;

                for res in results {
                    // outer Ok: the task did not panic; inner Ok: the download succeeded
                    if let Ok(Ok((_, _, item, filename, _bytes))) = res {
                        total_downloaded += 1;
                        stats.downloaded.fetch_add(1, Ordering::Relaxed);
                        page_downloaded += 1;

                        let cdn_url = format!("{}/{}", CDN_BASE, filename);
                        let tags = item.tags.join(", ");
                        // NOTE(review): title and tags go into the table unescaped; a
                        // `|` in either would break the row — confirm the inputs.
                        new_readme_rows.push_str(&format!(
                            "| <img src=\"{}\" width=\"200\"> | **{}**<br>[Download]({}) | {} |\n",
                            cdn_url, item.title, cdn_url, tags
                        ));
                    } else {
                        total_failed += 1;
                        stats.failed.fetch_add(1, Ordering::Relaxed);
                    }
                }

                if page_downloaded > 0 {
                    // held until the end of this block, including any git work below
                    let _lock = md_mutex.lock().await;
                    append_to_readme(md_file, &new_readme_rows);

                    let mut count = unpushed_count.lock().await;
                    *count += page_downloaded;

                    if *count >= 50 {
                        if std::env::var("GITHUB_ACTIONS").is_ok() {
                            println!("[push] freezing downloads to commit batch of {} images...", *count);
                            // Take all 30 permits so no download task is running, and
                            // nothing mutates assets/, while git scans the directory.
                            // NOTE(review): 30 must match the semaphore size created
                            // in main.
                            let _freeze = dl_semaphore.acquire_many(30).await.unwrap();

                            // drop any stale index lock (best effort)
                            let _ = std::fs::remove_file(".git/index.lock");
                            let _ = tokio::process::Command::new("git").args(["add", "--ignore-removal", "--sparse", "README.md", "assets"])
                                .stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await;
                            let _ = tokio::process::Command::new("git").args(["commit", "-m", "chore: archive batch of new wallpapers [skip ci]"])
                                .stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await;
                            let push_status = tokio::process::Command::new("git").args(["-c", "http.postBuffer=524288000", "push"])
                                .status().await;

                            if let Ok(s) = push_status {
                                if s.success() {
                                    stats.pushed.fetch_add(1, Ordering::Relaxed);
                                    println!("[push] success! cleaning up local assets to free disk...");
                                    // Delete every file directly inside assets/ after a
                                    // successful push to free disk space.
                                    // NOTE(review): the path is the literal "assets", not
                                    // `source_name`.
                                    if let Ok(entries) = std::fs::read_dir("assets") {
                                        for entry in entries.flatten() {
                                            let _ = std::fs::remove_file(entry.path());
                                        }
                                    }
                                    print_stats(&stats, md_file);
                                } else {
                                    println!("[push] failed! keeping local files for retry");
                                }
                            }
                        }
                        // NOTE(review): the counter is reset even when the push failed
                        // or when not running under GITHUB_ACTIONS.
                        *count = 0;
                    }
                }
            }
            Err(e) => {
                consecutive_errors += 1;
                println!("[error] {} page {} failed after retries: {}", tag_label, page, e);

                if consecutive_errors >= 5 {
                    println!("[halt] {} — too many consecutive failures", tag_label);
                    break;
                }
            }
        }

        page += 1;
        // short pause between listing pages
        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
    }
    println!("[done] {} — downloaded: {}, failed: {}", tag_label, total_downloaded, total_failed);
}
|
||||||
|
|
||||||
|
/// Collects the ids of wallpapers already listed in `md_file`.
///
/// For every line, the text between the first `/<source_name>/` and the next
/// `.` is taken as an id (for example `.../assets/wallpaperflare-abc.jpg`
/// yields `wallpaperflare-abc`). Lines without that pattern, and empty ids,
/// are ignored. An unreadable file yields an empty set.
fn load_existing_ids(source_name: &str, md_file: &str) -> HashSet<String> {
    // built once; it does not change from line to line
    let marker = format!("/{}/", source_name);

    let content = match std::fs::read_to_string(md_file) {
        Ok(text) => text,
        Err(_) => return HashSet::new(),
    };

    content
        .lines()
        .filter_map(|line| {
            let (_, after_marker) = line.split_once(marker.as_str())?;
            let (slug, _) = after_marker.split_once('.')?;
            (!slug.is_empty()).then(|| slug.to_string())
        })
        .collect()
}
|
||||||
|
|
||||||
|
/// Appends `rows` to `md_file` directly after its last non-blank line.
///
/// Trailing whitespace of the existing content is removed first so that no
/// blank line ends up between the table and the new rows (which would break
/// the markdown table). If the file cannot be read nothing is written; a
/// failed write is ignored.
fn append_to_readme(md_file: &str, rows: &str) {
    let current = match std::fs::read_to_string(md_file) {
        Ok(text) => text,
        Err(_) => return,
    };

    let kept = current.trim_end();
    let mut updated = String::with_capacity(kept.len() + 1 + rows.len());
    updated.push_str(kept);
    updated.push('\n');
    updated.push_str(rows);

    let _ = std::fs::write(md_file, updated);
}
|
||||||
|
|
||||||
|
/// Rewrites `md_file` with its gallery rows (lines starting with `| <img`)
/// sorted in ascending string order.
///
/// Everything before the first gallery row is kept verbatim as the header.
/// Lines after the first gallery row that are not gallery rows are dropped.
/// Does nothing if the file cannot be read; a failed write is ignored.
#[allow(dead_code)]
fn sort_readme(md_file: &str) {
    const ROW_PREFIX: &str = "| <img";

    let content = match std::fs::read_to_string(md_file) {
        Ok(text) => text,
        Err(_) => return,
    };

    let all_lines: Vec<&str> = content.lines().collect();

    // the header is every line up to (not including) the first gallery row
    let first_row = all_lines
        .iter()
        .position(|line| line.starts_with(ROW_PREFIX))
        .unwrap_or(all_lines.len());
    let (preamble, remainder) = all_lines.split_at(first_row);

    let mut rows: Vec<&str> = remainder
        .iter()
        .copied()
        .filter(|line| line.starts_with(ROW_PREFIX))
        .collect();
    rows.sort();

    let mut rebuilt = preamble.join("\n");
    rebuilt.push('\n');
    for row in &rows {
        rebuilt.push_str(row);
        rebuilt.push('\n');
    }

    let _ = std::fs::write(md_file, rebuilt);
    println!("sorted readme: {} entries alphabetically in {}", rows.len(), md_file);
}
|
||||||
375
src/wallpaperflare.rs
Normal file
375
src/wallpaperflare.rs
Normal file
|
|
@ -0,0 +1,375 @@
|
||||||
|
use scraper::{Html, Selector};
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use crate::wallpapersclan::WallpaperEntry;
|
||||||
|
|
||||||
|
pub fn build_client() -> Result<wreq::Client, String> {
|
||||||
|
use wreq::header::{HeaderMap, HeaderValue};
|
||||||
|
let mut headers = HeaderMap::new();
|
||||||
|
headers.insert("accept", HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"));
|
||||||
|
headers.insert("accept-encoding", HeaderValue::from_static("gzip, deflate, br, zstd"));
|
||||||
|
headers.insert("accept-language", HeaderValue::from_static("en-US,en;q=0.9,hi;q=0.8,de;q=0.7,ja;q=0.6"));
|
||||||
|
headers.insert("cache-control", HeaderValue::from_static("max-age=0"));
|
||||||
|
headers.insert("dnt", HeaderValue::from_static("1"));
|
||||||
|
headers.insert("priority", HeaderValue::from_static("u=0, i"));
|
||||||
|
headers.insert("sec-ch-ua", HeaderValue::from_static(r#""Chromium";v="148", "Google Chrome";v="148", "Not/A)Brand";v="99""#));
|
||||||
|
headers.insert("sec-ch-ua-mobile", HeaderValue::from_static("?0"));
|
||||||
|
headers.insert("sec-ch-ua-platform", HeaderValue::from_static(r#""Windows""#));
|
||||||
|
headers.insert("sec-fetch-dest", HeaderValue::from_static("document"));
|
||||||
|
headers.insert("sec-fetch-mode", HeaderValue::from_static("navigate"));
|
||||||
|
headers.insert("sec-fetch-site", HeaderValue::from_static("same-origin"));
|
||||||
|
headers.insert("sec-fetch-user", HeaderValue::from_static("?1"));
|
||||||
|
headers.insert("upgrade-insecure-requests", HeaderValue::from_static("1"));
|
||||||
|
headers.insert("user-agent", HeaderValue::from_static("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36"));
|
||||||
|
|
||||||
|
wreq::Client::builder()
|
||||||
|
.emulation(wreq_util::Emulation::Chrome134)
|
||||||
|
.default_headers(headers)
|
||||||
|
.cookie_store(true)
|
||||||
|
.build()
|
||||||
|
.map_err(|e| e.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Turns `href` into an absolute URL.
///
/// * `http://…` / `https://…` are returned unchanged.
/// * Protocol-relative `//host/…` gets an `https:` prefix.
/// * Anything else is joined onto `base` (with trailing slashes removed),
///   inserting a `/` when `href` does not start with one.
pub fn absolute_url(href: &str, base: &str) -> String {
    let has_scheme = ["http://", "https://"]
        .iter()
        .any(|scheme| href.starts_with(*scheme));
    if has_scheme {
        return href.to_string();
    }

    if href.starts_with("//") {
        return format!("https:{}", href);
    }

    let root = base.trim_end_matches('/');
    let separator = if href.starts_with('/') { "" } else { "/" };
    format!("{}{}{}", root, separator, href)
}
|
||||||
|
|
||||||
|
/// Extracts a single image URL from an attribute value.
///
/// Accepts a plain URL, a CSS-style `url(...)` / `url("...")` / `url('...')`
/// wrapper, or a `srcset`-style comma-separated candidate list. Only the first
/// comma-separated candidate is used. A trailing `srcset` descriptor such as
/// `2x`, `1.5x` or `300w` is removed so it does not end up inside the URL.
/// Returns an empty string for empty input.
pub fn pick_image_source(value: &str) -> String {
    let first_candidate = value.split(',').next().unwrap_or("").trim();

    strip_srcset_descriptor(first_candidate)
        .trim_start_matches("url(\"")
        .trim_start_matches("url('")
        .trim_start_matches("url(")
        .trim_end_matches("\")")
        .trim_end_matches("')")
        .trim_end_matches(")")
        .to_string()
}

/// Removes a trailing width/density descriptor (`<number>w` or `<number>x`)
/// from one `srcset` candidate. Anything else is returned unchanged, so URLs
/// that merely contain spaces are left alone.
fn strip_srcset_descriptor(candidate: &str) -> &str {
    if let Some((url_part, last_token)) = candidate.rsplit_once(char::is_whitespace) {
        let number = last_token
            .strip_suffix('w')
            .or_else(|| last_token.strip_suffix('x'));
        let looks_like_descriptor = number.map_or(false, |n| {
            !n.is_empty() && n.chars().all(|c| c.is_ascii_digit() || c == '.')
        });
        if looks_like_descriptor {
            return url_part.trim_end();
        }
    }
    candidate
}
|
||||||
|
|
||||||
|
/// Fetches one wallpaperflare listing page and returns its wallpapers with
/// resolved download URLs.
///
/// * `client` – HTTP client used for the listing and detail requests.
/// * `limit` – maximum number of listing entries to collect from the page.
/// * `page` – 1-based page number; page 1 uses the URL without a page parameter.
/// * `search_query` – tag to search for; `None` scrapes the front page listing.
///
/// # Errors
/// Returns a message string when the request fails, the status is not a
/// success, the body looks like a Cloudflare challenge page, or no usable
/// entry was found on the page.
pub async fn scrape_wallpaperflare(
    client: &wreq::Client,
    limit: usize,
    page: u32,
    search_query: Option<&str>,
) -> Result<Vec<WallpaperEntry>, String> {
    // Build the listing URL. Spaces in the query become `+`; no other
    // escaping is applied.
    let url = if let Some(query) = search_query {
        let q = query.replace(" ", "+");
        if page > 1 {
            format!("https://www.wallpaperflare.com/search?wallpaper={}&page={}", q, page)
        } else {
            format!("https://www.wallpaperflare.com/search?wallpaper={}", q)
        }
    } else {
        if page > 1 {
            format!("https://www.wallpaperflare.com/index.php?page={}", page)
        } else {
            "https://www.wallpaperflare.com/".to_string()
        }
    };

    let response = client
        .get(&url)
        .header("Referer", "https://www.wallpaperflare.com/")
        .header("Sec-Fetch-Site", "same-origin")
        .timeout(std::time::Duration::from_secs(20))
        .send()
        .await
        .map_err(|e| format!("request failed: {}", e))?;

    let status = response.status();
    if !status.is_success() {
        return Err(format!("http {}", status));
    }

    let html = response.text().await.map_err(|e| e.to_string())?;

    // challenge pages are detected by marker text in the body
    if html.contains("cf-browser-verification") || html.contains("Checking your browser") {
        return Err("cloudflare challenge - browser verification required".to_string());
    }

    let mut temp_items = Vec::new();
    let mut seen_ids = HashSet::new();

    // The parsed document lives only inside this block — presumably so it is
    // dropped before the awaits further down (TODO confirm).
    {
        let document = Html::parse_document(&html);
        let li_selector = Selector::parse("li[itemprop=\"associatedMedia\"]").unwrap();
        let link_selector = Selector::parse("a[itemprop=\"url\"]").unwrap();
        let img_selector = Selector::parse("img[itemprop=\"contentUrl\"]").unwrap();
        let keywords_selector = Selector::parse("meta[itemprop=\"keywords\"]").unwrap();

        for li_element in document.select(&li_selector) {
            if temp_items.len() >= limit {
                break;
            }

            let link_element = match li_element.select(&link_selector).next() {
                Some(el) => el,
                None => continue,
            };

            // skip navigation links and anything that is not a wallpaper detail link
            let href = link_element.value().attr("href").unwrap_or("");
            if href.is_empty()
                || href.starts_with('#')
                || href.starts_with("/search")
                || href.starts_with("/tag")
                || href.starts_with("/page")
                || href == "/"
                || !href.contains("wallpaper")
            {
                continue;
            }

            let normalized_href = absolute_url(href, "https://www.wallpaperflare.com");
            if !normalized_href.to_lowercase().contains("wallpaper") {
                continue;
            }

            let media_elem = match link_element.select(&img_selector).next() {
                Some(el) => el,
                None => continue,
            };
            // thumbnail: the first attribute present wins, lazy-load attributes first
            let thumb = media_elem
                .value()
                .attr("data-src")
                .or_else(|| media_elem.value().attr("data-original"))
                .or_else(|| media_elem.value().attr("data-srcset"))
                .or_else(|| media_elem.value().attr("srcset"))
                .or_else(|| media_elem.value().attr("src"))
                .map(pick_image_source)
                .unwrap_or_default();

            if thumb.is_empty() {
                continue;
            }

            // the id is the text after the last `-` of the href
            let id = href
                .trim_start_matches('/')
                .split('-')
                .next_back()
                .unwrap_or("")
                .to_string();

            // ids shorter than 3 bytes and duplicates on the same page are skipped
            if id.is_empty() || id.len() < 3 || seen_ids.contains(&id) {
                continue;
            }
            seen_ids.insert(id.clone());

            let thumbnail_url = absolute_url(&thumb, "https://www.wallpaperflare.com");
            let title = media_elem
                .value()
                .attr("alt")
                .or_else(|| media_elem.value().attr("title"))
                .unwrap_or("WallpaperFlare Wallpaper")
                .to_string();

            // comma-separated keywords, trimmed, with empty entries removed
            let tags = li_element
                .select(&keywords_selector)
                .next()
                .and_then(|el| el.value().attr("content"))
                .map(|content| {
                    content
                        .split(',')
                        .map(|s| s.trim().to_string())
                        .filter(|s| !s.is_empty())
                        .collect::<Vec<String>>()
                })
                .unwrap_or_default();

            temp_items.push((id, title, thumbnail_url, normalized_href, tags));
        }
    }

    // NOTE(review): an empty page is reported as an error, so callers cannot
    // tell "no more results" apart from a page that failed to parse.
    if temp_items.is_empty() {
        return Err("wallpaperflare returned no results".to_string());
    }

    // Resolve the download URL of every entry in its own task.
    let mut handles = Vec::new();
    for (id, title, thumb, detail_url, tags) in temp_items {
        let client = client.clone();
        let detail = detail_url.clone();

        handles.push(tokio::spawn(async move {
            let download_url = resolve_wallpaperflare_download(&client, &detail).await;
            (id, title, thumb, detail_url, download_url, tags)
        }));
    }

    // Await the tasks in spawn order so the result keeps the page order.
    let mut items = Vec::new();
    for handle in handles {
        match handle.await {
            Ok((id, title, thumbnail_url, detail_url, download_result, tags)) => {
                // fall back to the thumbnail when no download URL could be resolved
                let download_url = match download_result {
                    Ok(url) => url,
                    Err(e) => {
                        // NOTE(review): `e` is unused here.
                        thumbnail_url.clone()
                    }
                };

                items.push(WallpaperEntry {
                    id: format!("wallpaperflare-{}", id),
                    title,
                    thumbnail_url,
                    detail_url,
                    download_url,
                    tags,
                });
            }
            Err(_e) => {
                // a task that failed to join drops its entry silently
            }
        }
    }

    Ok(items)
}
|
||||||
|
|
||||||
|
pub async fn resolve_wallpaperflare_download(
|
||||||
|
client: &wreq::Client,
|
||||||
|
detail_url: &str,
|
||||||
|
) -> Result<String, String> {
|
||||||
|
let absolute = absolute_url(detail_url, "https://www.wallpaperflare.com");
|
||||||
|
let download_page_url = format!("{}/download", absolute.trim_end_matches('/'));
|
||||||
|
|
||||||
|
if let Ok(response) = client
|
||||||
|
.get(&download_page_url)
|
||||||
|
.header("Referer", &absolute)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
if let Ok(html) = response.text().await {
|
||||||
|
let document = Html::parse_document(&html);
|
||||||
|
let show_img_selector = Selector::parse("#show_img").unwrap();
|
||||||
|
let content_url_selector = Selector::parse("img[itemprop=\"contentUrl\"]").unwrap();
|
||||||
|
|
||||||
|
let high_res_image = document
|
||||||
|
.select(&show_img_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|el| el.value().attr("src"))
|
||||||
|
.or_else(|| {
|
||||||
|
document
|
||||||
|
.select(&content_url_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|el| el.value().attr("src"))
|
||||||
|
});
|
||||||
|
|
||||||
|
if let Some(img_url) = high_res_image {
|
||||||
|
let final_url = absolute_url(img_url, "https://www.wallpaperflare.com");
|
||||||
|
return Ok(final_url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match client.get(&absolute)
|
||||||
|
.header("Referer", "https://www.wallpaperflare.com/")
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(response) => {
|
||||||
|
let html = response.text().await.map_err(|e| e.to_string())?;
|
||||||
|
let document = Html::parse_document(&html);
|
||||||
|
let content_url_selector = Selector::parse("img[itemprop=\"contentUrl\"]").unwrap();
|
||||||
|
let vimg_selector = Selector::parse("#vimg").unwrap();
|
||||||
|
let og_image_selector = Selector::parse("meta[property=\"og:image\"]").unwrap();
|
||||||
|
|
||||||
|
let detail_image = document
|
||||||
|
.select(&content_url_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|el| el.value().attr("src"))
|
||||||
|
.map(pick_image_source)
|
||||||
|
.or_else(|| {
|
||||||
|
document
|
||||||
|
.select(&vimg_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|el| el.value().attr("src"))
|
||||||
|
.map(pick_image_source)
|
||||||
|
})
|
||||||
|
.or_else(|| {
|
||||||
|
document
|
||||||
|
.select(&og_image_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|el| el.value().attr("content"))
|
||||||
|
.map(pick_image_source)
|
||||||
|
});
|
||||||
|
|
||||||
|
if let Some(img_url) = detail_image {
|
||||||
|
let final_url = absolute_url(&img_url, "https://www.wallpaperflare.com");
|
||||||
|
return Ok(final_url);
|
||||||
|
}
|
||||||
|
|
||||||
|
Err("no image found on detail page".to_string())
|
||||||
|
}
|
||||||
|
Err(e) => Err(format!("failed to fetch detail page: {}", e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn download_wallpaper(client: &wreq::Client, url: &str, path: &Path) -> Result<u64, String> {
|
||||||
|
const MAX_FILE_SIZE: u64 = 30 * 1024 * 1024;
|
||||||
|
|
||||||
|
let response = client
|
||||||
|
.get(url)
|
||||||
|
.header("Referer", "https://www.wallpaperflare.com/")
|
||||||
|
.timeout(std::time::Duration::from_secs(60))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("download request failed: {}", e))?;
|
||||||
|
|
||||||
|
if !response.status().is_success() {
|
||||||
|
return Err(format!("http {}", response.status()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// bail early using content-length header — no need to download 140 mb of garbage
|
||||||
|
if let Some(cl) = response.content_length() {
|
||||||
|
if cl > MAX_FILE_SIZE {
|
||||||
|
return Err(format!(
|
||||||
|
"file too large ({:.2} MB, limit is {} MB) — skipping",
|
||||||
|
cl as f64 / (1024.0 * 1024.0),
|
||||||
|
MAX_FILE_SIZE / (1024 * 1024)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = response.bytes().await.map_err(|e| e.to_string())?;
|
||||||
|
let len = bytes.len() as u64;
|
||||||
|
|
||||||
|
// safety net in case content-length header was missing or lied
|
||||||
|
if len > MAX_FILE_SIZE {
|
||||||
|
return Err(format!(
|
||||||
|
"file too large ({:.2} MB, limit is {} MB) — skipping",
|
||||||
|
len as f64 / (1024.0 * 1024.0),
|
||||||
|
MAX_FILE_SIZE / (1024 * 1024)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::fs::write(path, &bytes).map_err(|e| format!("write failed: {}", e))?;
|
||||||
|
|
||||||
|
Ok(len)
|
||||||
|
}
|
||||||
352
src/wallpapersclan.rs
Normal file
352
src/wallpapersclan.rs
Normal file
|
|
@ -0,0 +1,352 @@
|
||||||
|
#![allow(dead_code, unused_variables)]
|
||||||
|
use scraper::{Html, Selector};
|
||||||
|
use std::path::Path;
|
||||||
|
use wreq_util::Emulation;
|
||||||
|
|
||||||
|
/// Site root; used as the `Referer` for the first listing page and as the
/// prefix of the `admin-ajax.php` endpoint.
const BASE_URL: &str = "https://wallpapers-clan.com";
|
||||||
|
/// Desktop wallpaper listing page; fetched for page 1 and sent as the
/// `Referer` on ajax, detail-page and download requests.
const DESKTOP_URL: &str = "https://wallpapers-clan.com/desktop-wallpapers/";
|
||||||
|
|
||||||
|
use serde::{Serialize, Deserialize};
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct WallpaperEntry {
|
||||||
|
pub id: String,
|
||||||
|
pub title: String,
|
||||||
|
pub thumbnail_url: String,
|
||||||
|
pub detail_url: String,
|
||||||
|
pub download_url: String,
|
||||||
|
pub tags: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// build a wreq client impersonating chrome — no cookies needed,
|
||||||
|
/// the tls fingerprint alone bypasses cloudflare's managed challenge
|
||||||
|
fn build_client() -> Result<wreq::Client, String> {
|
||||||
|
wreq::Client::builder()
|
||||||
|
.emulation(Emulation::Chrome134)
|
||||||
|
.cookie_store(true)
|
||||||
|
.build()
|
||||||
|
.map_err(|e| e.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// scrape the listing page and resolve download urls from detail pages
|
||||||
|
pub async fn scrape_wallpapersclan(
|
||||||
|
limit: usize,
|
||||||
|
page: u32,
|
||||||
|
) -> Result<Vec<WallpaperEntry>, String> {
|
||||||
|
let client = build_client()?;
|
||||||
|
|
||||||
|
let (url, response) = if page > 1 {
|
||||||
|
let ajax_url = format!("{}/wp-admin/admin-ajax.php", BASE_URL);
|
||||||
|
println!("[listing] fetching (ajax): {} (page {})", ajax_url, page);
|
||||||
|
|
||||||
|
let mut form = std::collections::HashMap::new();
|
||||||
|
form.insert("action", "boldlab_get_new_posts");
|
||||||
|
form.insert("options[plugin]", "boldlab_core");
|
||||||
|
form.insert("options[module]", "post-types/dwallpapers/shortcodes");
|
||||||
|
form.insert("options[shortcode]", "dwallpapers-list");
|
||||||
|
form.insert("options[post_type]", "dwallpapers");
|
||||||
|
let page_str = page.to_string();
|
||||||
|
form.insert("options[next_page]", &page_str);
|
||||||
|
form.insert("options[max_pages_num]", "863");
|
||||||
|
form.insert("options[show_category]", "no");
|
||||||
|
form.insert("options[behavior]", "columns");
|
||||||
|
form.insert("options[images_proportion]", "full");
|
||||||
|
form.insert("options[columns]", "3");
|
||||||
|
form.insert("options[space]", "normal");
|
||||||
|
form.insert("options[columns_responsive]", "predefined");
|
||||||
|
form.insert("options[columns_1440]", "3");
|
||||||
|
form.insert("options[columns_1366]", "3");
|
||||||
|
form.insert("options[columns_1024]", "3");
|
||||||
|
form.insert("options[columns_768]", "3");
|
||||||
|
form.insert("options[columns_680]", "3");
|
||||||
|
form.insert("options[columns_480]", "3");
|
||||||
|
form.insert("options[posts_per_page]", "12");
|
||||||
|
form.insert("options[orderby]", "date");
|
||||||
|
form.insert("options[order]", "DESC");
|
||||||
|
form.insert("options[additional_params]", "tax");
|
||||||
|
form.insert("options[layout]", "info-below");
|
||||||
|
form.insert("options[hover_animation_info-below]", "tilt");
|
||||||
|
form.insert("options[hover_animation_info-follow]", "follow");
|
||||||
|
form.insert("options[hover_animation_info-on-hover]", "direction-aware");
|
||||||
|
form.insert("options[title_tag]", "h4");
|
||||||
|
form.insert("options[custom_padding]", "no");
|
||||||
|
form.insert("options[enable_filter]", "yes");
|
||||||
|
form.insert("options[pagination_type]", "infinite-scroll");
|
||||||
|
form.insert("options[loading_animation]", "no");
|
||||||
|
form.insert("options[object_class_name]", "BoldlabCoredwallpapersListShortcode");
|
||||||
|
form.insert("options[taxonomy_filter]", "dwallpapers-category");
|
||||||
|
form.insert("options[space_value]", "15");
|
||||||
|
form.insert("options[justified_attr]", "{\"rowHeight\":\"\",\"spaceBetween\":15}");
|
||||||
|
|
||||||
|
let resp = client
|
||||||
|
.post(&ajax_url)
|
||||||
|
.header("Referer", DESKTOP_URL)
|
||||||
|
.header("X-Requested-With", "XMLHttpRequest")
|
||||||
|
.form(&form)
|
||||||
|
.timeout(std::time::Duration::from_secs(20))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("ajax request failed: {}", e))?;
|
||||||
|
|
||||||
|
(ajax_url, resp)
|
||||||
|
} else {
|
||||||
|
println!("[listing] fetching: {}", DESKTOP_URL);
|
||||||
|
let resp = client
|
||||||
|
.get(DESKTOP_URL)
|
||||||
|
.header("Referer", BASE_URL)
|
||||||
|
.timeout(std::time::Duration::from_secs(20))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("request failed: {}", e))?;
|
||||||
|
(DESKTOP_URL.to_string(), resp)
|
||||||
|
};
|
||||||
|
|
||||||
|
let status = response.status();
|
||||||
|
println!("[listing] http {}", status);
|
||||||
|
|
||||||
|
if !status.is_success() {
|
||||||
|
return Err(format!("HTTP {} from wallpapers-clan", status));
|
||||||
|
}
|
||||||
|
|
||||||
|
let raw_text = response.text().await.map_err(|e| e.to_string())?;
|
||||||
|
|
||||||
|
let html = if page > 1 {
|
||||||
|
println!("[listing] raw response: {:.200}", raw_text);
|
||||||
|
// the ajax response is JSON with a "data" string containing the HTML
|
||||||
|
let json: serde_json::Value = serde_json::from_str(&raw_text)
|
||||||
|
.map_err(|e| format!("failed to parse ajax json: {}", e))?;
|
||||||
|
json["data"]
|
||||||
|
.as_str()
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string()
|
||||||
|
} else {
|
||||||
|
raw_text
|
||||||
|
};
|
||||||
|
|
||||||
|
// first pass: collect listing data
|
||||||
|
let mut listing_items: Vec<(String, String, String, Vec<String>)> = Vec::new();
|
||||||
|
|
||||||
|
{
|
||||||
|
let document = Html::parse_document(&html);
|
||||||
|
|
||||||
|
// selectors for the qodef grid layout
|
||||||
|
let article_selector = Selector::parse("article.qodef-grid-item").unwrap();
|
||||||
|
let media_link_selector = Selector::parse(".qodef-e-media-image a[itemprop='url']").unwrap();
|
||||||
|
let img_selector = Selector::parse("img.wp-post-image").unwrap();
|
||||||
|
let noscript_selector = Selector::parse("noscript").unwrap();
|
||||||
|
let title_selector = Selector::parse("h4.qodef-e-title a.qodef-e-title-link").unwrap();
|
||||||
|
let category_selector = Selector::parse(".qodef-e-info-category a.qodef-e-category").unwrap();
|
||||||
|
|
||||||
|
let articles: Vec<_> = document.select(&article_selector).collect();
|
||||||
|
println!("[listing] found {} articles", articles.len());
|
||||||
|
|
||||||
|
for article in articles.iter() {
|
||||||
|
if listing_items.len() >= limit {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// detail page url
|
||||||
|
let detail_url = match article.select(&media_link_selector).next() {
|
||||||
|
Some(a) => match a.value().attr("href") {
|
||||||
|
Some(href) if href.contains("desktop-wallpapers") => href.to_string(),
|
||||||
|
_ => continue,
|
||||||
|
},
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
// thumbnail — data-lazy-src > data-lazy-srcset > noscript fallback
|
||||||
|
let thumbnail_url = article
|
||||||
|
.select(&img_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|img| {
|
||||||
|
if let Some(src) = img.value().attr("data-lazy-src") {
|
||||||
|
if !src.contains("data:image/svg") {
|
||||||
|
return Some(src.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(srcset) = img.value().attr("data-lazy-srcset") {
|
||||||
|
if let Some(first) = srcset.split(',').next() {
|
||||||
|
let url = first.trim().split_whitespace().next().unwrap_or("");
|
||||||
|
if !url.is_empty() && !url.contains("data:image/svg") {
|
||||||
|
return Some(url.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(s) = img.value().attr("src") {
|
||||||
|
if !s.contains("data:image/svg") {
|
||||||
|
return Some(s.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
})
|
||||||
|
.or_else(|| {
|
||||||
|
article.select(&noscript_selector).next().and_then(|ns| {
|
||||||
|
let inner = ns.inner_html();
|
||||||
|
let frag = Html::parse_fragment(&inner);
|
||||||
|
let img_sel = Selector::parse("img").unwrap();
|
||||||
|
frag.select(&img_sel).next().and_then(|img| {
|
||||||
|
img.value()
|
||||||
|
.attr("src")
|
||||||
|
.filter(|s| !s.contains("data:image/svg"))
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
let thumbnail_url = match thumbnail_url {
|
||||||
|
Some(url) => url,
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
// title
|
||||||
|
let title = article
|
||||||
|
.select(&title_selector)
|
||||||
|
.next()
|
||||||
|
.map(|t| t.text().collect::<String>().trim().to_string())
|
||||||
|
.unwrap_or_else(|| "Untitled".to_string());
|
||||||
|
|
||||||
|
// tags from categories
|
||||||
|
let tags: Vec<String> = article
|
||||||
|
.select(&category_selector)
|
||||||
|
.map(|cat| cat.text().collect::<String>().trim().to_string())
|
||||||
|
.filter(|t| !t.is_empty())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
listing_items.push((detail_url, thumbnail_url, title, tags));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("[listing] collected {} items, resolving download urls...", listing_items.len());
|
||||||
|
|
||||||
|
// second pass: resolve download urls from detail pages concurrently
|
||||||
|
let mut handles = Vec::new();
|
||||||
|
for (detail_url, thumb, title, tags) in listing_items {
|
||||||
|
let client = client.clone();
|
||||||
|
let detail = detail_url.clone();
|
||||||
|
handles.push(tokio::spawn(async move {
|
||||||
|
let download_url = resolve_download(&client, &detail).await;
|
||||||
|
(detail_url, thumb, title, tags, download_url)
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut items = Vec::new();
|
||||||
|
for handle in handles {
|
||||||
|
match handle.await {
|
||||||
|
Ok((detail_url, thumbnail_url, title, tags, download_result)) => {
|
||||||
|
// slug for id
|
||||||
|
let slug = detail_url
|
||||||
|
.trim_end_matches('/')
|
||||||
|
.split('/')
|
||||||
|
.next_back()
|
||||||
|
.unwrap_or("unknown")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let download_url = match download_result {
|
||||||
|
Ok(url) => url,
|
||||||
|
Err(e) => {
|
||||||
|
println!(" [warn] failed to resolve {}: {}", slug, e);
|
||||||
|
// fallback to thumbnail as download
|
||||||
|
thumbnail_url.clone()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
items.push(WallpaperEntry {
|
||||||
|
id: slug,
|
||||||
|
title,
|
||||||
|
thumbnail_url,
|
||||||
|
detail_url,
|
||||||
|
download_url,
|
||||||
|
tags,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
println!(" [warn] task failed: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("[listing] resolved {} download urls", items.len());
|
||||||
|
Ok(items)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// resolve the actual download url from a detail page
|
||||||
|
/// grabs a.wpdm-download-link[data-downloadurl] — baked in by wordpress
|
||||||
|
async fn resolve_download(client: &wreq::Client, detail_url: &str) -> Result<String, String> {
|
||||||
|
let response = client
|
||||||
|
.get(detail_url)
|
||||||
|
.header("Referer", DESKTOP_URL)
|
||||||
|
.timeout(std::time::Duration::from_secs(15))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("request failed: {}", e))?;
|
||||||
|
|
||||||
|
if !response.status().is_success() {
|
||||||
|
return Err(format!("HTTP {}", response.status()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let html = response.text().await.map_err(|e| e.to_string())?;
|
||||||
|
let document = Html::parse_document(&html);
|
||||||
|
|
||||||
|
// primary: wpdm download button with data-downloadurl
|
||||||
|
let download_btn = Selector::parse("a.wpdm-download-link").unwrap();
|
||||||
|
if let Some(btn) = document.select(&download_btn).next() {
|
||||||
|
if let Some(url) = btn.value().attr("data-downloadurl") {
|
||||||
|
if !url.is_empty() {
|
||||||
|
return Ok(url.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fallback: any download link
|
||||||
|
let fallback = Selector::parse(".media-body a[href*='download']").unwrap();
|
||||||
|
if let Some(link) = document.select(&fallback).next() {
|
||||||
|
if let Some(href) = link.value().attr("href") {
|
||||||
|
return Ok(href.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// last resort: full-res image on the page
|
||||||
|
let img_sel = Selector::parse("img.wp-post-image").unwrap();
|
||||||
|
if let Some(img) = document.select(&img_sel).next() {
|
||||||
|
let src = img
|
||||||
|
.value()
|
||||||
|
.attr("data-lazy-src")
|
||||||
|
.or_else(|| {
|
||||||
|
img.value().attr("data-lazy-srcset").and_then(|srcset| {
|
||||||
|
srcset.split(',').next().and_then(|s| s.trim().split_whitespace().next())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.or_else(|| img.value().attr("src"))
|
||||||
|
.filter(|s| !s.contains("data:image/svg"));
|
||||||
|
|
||||||
|
if let Some(url) = src {
|
||||||
|
return Ok(url.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err("no download url found".to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// download a file to disk, returns bytes written
|
||||||
|
pub async fn download_wallpaper(url: &str, path: &Path) -> Result<u64, String> {
|
||||||
|
let client = build_client()?;
|
||||||
|
|
||||||
|
let response = client
|
||||||
|
.get(url)
|
||||||
|
.header("Referer", DESKTOP_URL)
|
||||||
|
.timeout(std::time::Duration::from_secs(60))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("download request failed: {}", e))?;
|
||||||
|
|
||||||
|
if !response.status().is_success() {
|
||||||
|
return Err(format!("HTTP {}", response.status()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = response.bytes().await.map_err(|e| e.to_string())?;
|
||||||
|
let len = bytes.len() as u64;
|
||||||
|
|
||||||
|
std::fs::write(path, &bytes).map_err(|e| format!("write failed: {}", e))?;
|
||||||
|
|
||||||
|
Ok(len)
|
||||||
|
}
|
||||||
Loading…
Reference in a new issue