From a4f0265fdc53b4d7524293ccd986caedb153eb8f Mon Sep 17 00:00:00 2001 From: LaxentaInc Date: Sun, 7 Jun 2026 16:26:34 +0530 Subject: [PATCH] i know wpflare likes being touched by a cf bypass, thats why they dont pull thier panties back up(go on im under attack mode) --- src/main.rs | 520 ++++++++++++++++++++++++++++++++++++++++++ src/wallpaperflare.rs | 375 ++++++++++++++++++++++++++++++ src/wallpapersclan.rs | 352 ++++++++++++++++++++++++++++ 3 files changed, 1247 insertions(+) create mode 100644 src/main.rs create mode 100644 src/wallpaperflare.rs create mode 100644 src/wallpapersclan.rs diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..ac705f0 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,520 @@ +mod wallpapersclan; +mod wallpaperflare; + +use std::collections::HashSet; +use std::path::Path; + +const CDN_BASE: &str = "https://raw.githubusercontent.com/yapude/wallpapers/main/assets"; + +use std::sync::Arc; +use std::sync::atomic::{AtomicU32, Ordering}; +use tokio::sync::{Mutex, Semaphore}; + +// global stats that all tasks can update without locking +struct Stats { + downloaded: AtomicU32, + skipped: AtomicU32, + failed: AtomicU32, + pushed: AtomicU32, +} + +impl Stats { + fn new() -> Self { + Self { + downloaded: AtomicU32::new(0), + skipped: AtomicU32::new(0), + failed: AtomicU32::new(0), + pushed: AtomicU32::new(0), + } + } +} + +// get disk usage stats for the runner +fn get_disk_usage() -> String { + let output_dir = Path::new("assets"); + let mut total_bytes: u64 = 0; + let mut file_count: u64 = 0; + if let Ok(entries) = std::fs::read_dir(output_dir) { + for entry in entries.flatten() { + if let Ok(meta) = entry.metadata() { + if meta.is_file() { + total_bytes += meta.len(); + file_count += 1; + } + } + } + } + let mb = total_bytes / (1024 * 1024); + format!("{}MB across {} files", mb, file_count) +} + +fn get_readme_lines(md_file: &str) -> usize { + std::fs::read_to_string(md_file) + .map(|c| c.lines().count()) + .unwrap_or(0) +} + 
+// print a compact stats dashboard +fn print_stats(stats: &Stats, md_file: &str) { + let dl = stats.downloaded.load(Ordering::Relaxed); + let skip = stats.skipped.load(Ordering::Relaxed); + let fail = stats.failed.load(Ordering::Relaxed); + let pushes = stats.pushed.load(Ordering::Relaxed); + let readme_lines = get_readme_lines(md_file); + let disk = get_disk_usage(); + println!( + "[stats] downloaded: {} | skipped: {} | failed: {} | pushes: {} | readme: {} lines | local disk: {}", + dl, skip, fail, pushes, readme_lines, disk + ); +} + +#[tokio::main] +async fn main() { + println!("=== site-archive scraper ==="); + + // Global limits and locks + let dl_semaphore = Arc::new(Semaphore::new(30)); // max 30 concurrent downloads across all tags + let md_mutex = Arc::new(Mutex::new(())); + let unpushed_count = Arc::new(Mutex::new(0u32)); + let stats = Arc::new(Stats::new()); +/* tag storage + "anime", + "genshin impact", + "wuthering waves", + "artwork", + "space", + "anime sexy", + "blue archive", + "video games", +----------------------- + + +*/ + // scrape wallpaperflare with specific tags + let flare_tags = vec![ + "night", + "graphics", + "city", + "architecture", + "landscape", + "nature", + "space", + "fantasy art", + "honkai star rail", + "zenless zone zero", + "arknights", + "artistic", + "water", + "sky", + "river", + "art", + "trees", + "minecraft", + "painting", + "clouds", + "beauty in nature", + "tree", + "plant", + "scenics - nature", + "oil on canvas", + "tranquility", + "outside", + "tranquil scene", + "country", + "countryside", + "day", + "land", + "forest", + "cloud - sky", + "mountains", + "mountain", + "artistry", + "reflections", + "lake", + "scenic", + "non-urban scene", + "environment", + "people", + "loli", + "anime girls", + "ecchi", + "school uniform", + "Houkai Gakuen", + "Kiana Kaslana", + "thigh-highs", + "skirt", + "artwork", + "weapon", + "anime", + "Honkai", + "backgrounds", + "computer Graphic", + "technology", + "futuristic", + 
"vector", + "illustration", + "men", + "fantasy", + "astronomy", + "abstract", + "representation", + "indoors", + "still life", + "art and craft", + "no people", + "high angle view", + "creativity", + "human representation", + "celebration", + "table", + "multi colored", + "confetti", + "decoration", + "toy", + "close-up", + "large group of objects", + "craft", + "white", + "haired", + "female", + "character", + "manga", + "fan art", + "minimalism", + "monochrome", + "dark background", + "pantsu shot", + "uniform", + "selective coloring", + "ecchi", + "Tanaka Kotoha", + "gyorui", + "katsuwo drawing", + "map", + "thighs", + "science fiction", + "sunset", + "walking", + "woman", + "street", + "lantern", + ]; + + let shared_client = match wallpaperflare::build_client() { + Ok(c) => Arc::new(c), + Err(e) => { + println!("failed to build client: {}", e); + return; + } + }; + + let mut tasks = Vec::new(); + for tag in flare_tags { + let sem = dl_semaphore.clone(); + let mtx = md_mutex.clone(); + let u_count = unpushed_count.clone(); + let s = stats.clone(); + let tag = tag.to_string(); + let client = shared_client.clone(); + tasks.push(tokio::spawn(async move { + scrape_source(client, "assets", "README.md", Some(&tag), u32::MAX, sem, mtx, u_count, s).await; + })); + } + + // Wait for all tag scraping tasks to finish + futures::future::join_all(tasks).await; + + if std::env::var("GITHUB_ACTIONS").is_ok() { + let _ = std::fs::remove_file(".git/index.lock"); + let _ = tokio::process::Command::new("git").args(["add", "--ignore-removal", "--sparse", "README.md", "assets"]) + .stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await; + let _ = tokio::process::Command::new("git").args(["commit", "-m", "chore: sort readme alphabetically [skip ci]"]) + .stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await; + // let _ = tokio::process::Command::new("git").args(["push"]) + // 
.stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await; + let _ = tokio::process::Command::new("git").args(["-c", "http.postBuffer=524288000", "push"]).status().await; + } + + println!("=== all scraping complete! ==="); +} + +async fn scrape_source( + client: Arc, + source_name: &str, + md_file: &str, + search_query: Option<&str>, + max_pages: u32, + dl_semaphore: Arc, + md_mutex: Arc>, + unpushed_count: Arc>, + stats: Arc +) { + let tag_label = search_query.unwrap_or("all"); + let output_dir = Path::new(source_name); + if !output_dir.exists() { + std::fs::create_dir_all(output_dir).unwrap_or(()); + } + + let mut existing_ids = { + let _lock = md_mutex.lock().await; + load_existing_ids(source_name, md_file) + }; + + { + let _lock = md_mutex.lock().await; + let header = "# Wallpaper Archive\n\nAutomated archive of wallpapers to bypass Cloudflare and prevent dead links.\n\n## Gallery\n\n| Preview | Title | Tags |\n| --- | --- | --- |\n"; + if !Path::new(md_file).exists() { + let _ = std::fs::write(md_file, header); + } else { + // make sure the table header exists in the file + // DANGER: never use unwrap_or_default() here! if read_to_string fails due to OOM, + // it will return "" and completely overwrite the 100k line file with just the header! 
+ if let Ok(content) = std::fs::read_to_string(md_file) { + if !content.contains("| --- | --- | --- |") { + let _ = std::fs::write(md_file, format!("{}{}", header, content)); + } + } else { + println!("[warn] failed to read {} to check header, skipping injection", md_file); + } + } + } + + let mut total_downloaded = 0u32; + let mut total_failed = 0u32; + let mut page = 1u32; + let mut consecutive_errors = 0u32; + let max_retries = 3u32; + + loop { + if page > max_pages { + break; + } + + let mut attempt = 0; + let result = loop { + attempt += 1; + let scrape_res = wallpaperflare::scrape_wallpaperflare(&client, 12, page, search_query).await; + + match scrape_res { + Ok(items) => break Ok(items), + Err(e) => { + if attempt >= max_retries { + break Err(e); + } + let wait = attempt * 5; + // println!("[retry] {} page {} attempt {}/{} failed: {} — waiting {}s...", source_name, page, attempt, max_retries, e, wait); + tokio::time::sleep(std::time::Duration::from_secs(wait as u64)).await; + } + } + }; + + match result { + Ok(items) => { + consecutive_errors = 0; + + if items.is_empty() { + // println!("[{}] exhausted at page {}", tag_label, page); + break; + } + let mut page_downloaded = 0; + let mut new_readme_rows = String::new(); + + let mut download_tasks = Vec::new(); + for item in items { + let slug = item.id.clone(); + if existing_ids.contains(&slug) { + stats.skipped.fetch_add(1, Ordering::Relaxed); + continue; + } + existing_ids.insert(slug.clone()); + + let output_dir = output_dir.to_path_buf(); + let max_retries = max_retries; + let sem = dl_semaphore.clone(); + let client = client.clone(); + + download_tasks.push(tokio::spawn(async move { + let _permit = sem.acquire().await.unwrap(); + let ext = if item.download_url.contains(".png") { "png" } else { "jpg" }; + let filename = format!("{}.{}", slug, ext); + let filepath = output_dir.join(&filename); + + if filepath.exists() { + return Ok((slug, ext, item, filename, 0)); + } + + let manifest_path = 
output_dir.join(format!("{}.json", slug)); + if let Ok(json) = serde_json::to_string_pretty(&item) { + let _ = std::fs::write(&manifest_path, json); + } + + // silent download — stats printed per batch + + for dl_attempt in 1..=max_retries { + let dl_res = wallpaperflare::download_wallpaper(&client, &item.download_url, &filepath).await; + + match dl_res { + Ok(bytes) => return Ok((slug, ext, item, filename, bytes)), + Err(e) => { + // don't retry permanent errors — size rejections etc are not transient + if e.contains("too large") || e.contains("write failed") { + // permanent error, skip silently + let _ = std::fs::remove_file(&manifest_path); + return Err(()); + } + if dl_attempt < max_retries { + tokio::time::sleep(std::time::Duration::from_secs(3)).await; + } else { + let _ = std::fs::remove_file(&manifest_path); + return Err(()); + } + } + } + } + Err(()) + })); + } + + let results = futures::future::join_all(download_tasks).await; + + for res in results { + if let Ok(Ok((_, _, item, filename, _bytes))) = res { + total_downloaded += 1; + stats.downloaded.fetch_add(1, Ordering::Relaxed); + page_downloaded += 1; + + let cdn_url = format!("{}/{}", CDN_BASE, filename); + let tags = item.tags.join(", "); + new_readme_rows.push_str(&format!( + "| | **{}**
[Download]({}) | {} |\n", + cdn_url, item.title, cdn_url, tags + )); + } else { + total_failed += 1; + stats.failed.fetch_add(1, Ordering::Relaxed); + } + } + + if page_downloaded > 0 { + let _lock = md_mutex.lock().await; + append_to_readme(md_file, &new_readme_rows); + + let mut count = unpushed_count.lock().await; + *count += page_downloaded; + + if *count >= 50 { + if std::env::var("GITHUB_ACTIONS").is_ok() { + println!("[push] freezing downloads to commit batch of {} images...", *count); + // acquire all 30 permits to absolutely guarantee NO other tags are downloading + // or mutating the assets/ directory while git is scanning it + let _freeze = dl_semaphore.acquire_many(30).await.unwrap(); + + let _ = std::fs::remove_file(".git/index.lock"); + let _ = tokio::process::Command::new("git").args(["add", "--ignore-removal", "--sparse", "README.md", "assets"]) + .stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await; + let _ = tokio::process::Command::new("git").args(["commit", "-m", "chore: archive batch of new wallpapers [skip ci]"]) + .stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().await; + let push_status = tokio::process::Command::new("git").args(["-c", "http.postBuffer=524288000", "push"]) + .status().await; + + if let Ok(s) = push_status { + if s.success() { + stats.pushed.fetch_add(1, Ordering::Relaxed); + println!("[push] success! cleaning up local assets to free disk..."); + // nuke local image files after push to free disk space + // keep readme and .git intact obviously + if let Ok(entries) = std::fs::read_dir("assets") { + for entry in entries.flatten() { + let _ = std::fs::remove_file(entry.path()); + } + } + print_stats(&stats, md_file); + } else { + println!("[push] failed! 
keeping local files for retry"); + } + } + } + *count = 0; + } + } + } + Err(e) => { + consecutive_errors += 1; + println!("[error] {} page {} failed after retries: {}", tag_label, page, e); + + if consecutive_errors >= 5 { + println!("[halt] {} — too many consecutive failures", tag_label); + break; + } + } + } + + page += 1; + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + } + println!("[done] {} — downloaded: {}, failed: {}", tag_label, total_downloaded, total_failed); +} + +fn load_existing_ids(source_name: &str, md_file: &str) -> HashSet { + let mut ids = HashSet::new(); + if let Ok(content) = std::fs::read_to_string(md_file) { + for line in content.lines() { + let search_str = format!("/{}/", source_name); + if let Some(start) = line.find(&search_str) { + let after = &line[start + search_str.len()..]; + if let Some(dot) = after.find('.') { + let slug = &after[..dot]; + if !slug.is_empty() { + ids.insert(slug.to_string()); + } + } + } + } + } + ids +} + +fn append_to_readme(md_file: &str, rows: &str) { + // read existing content, trim trailing whitespace to avoid blank lines + // breaking the markdown table, then append rows directly after + if let Ok(existing) = std::fs::read_to_string(md_file) { + let trimmed = existing.trim_end(); + let new_content = format!("{}\n{}", trimmed, rows); + let _ = std::fs::write(md_file, new_content); + } +} + +#[allow(dead_code)] +fn sort_readme(md_file: &str) { + let content = match std::fs::read_to_string(md_file) { + Ok(c) => c, + Err(_) => return, + }; + + let lines: Vec<&str> = content.lines().collect(); + + let mut header_lines = Vec::new(); + let mut data_rows = Vec::new(); + + for line in &lines { + if line.starts_with("| Result { + use wreq::header::{HeaderMap, HeaderValue}; + let mut headers = HeaderMap::new(); + headers.insert("accept", 
HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")); + headers.insert("accept-encoding", HeaderValue::from_static("gzip, deflate, br, zstd")); + headers.insert("accept-language", HeaderValue::from_static("en-US,en;q=0.9,hi;q=0.8,de;q=0.7,ja;q=0.6")); + headers.insert("cache-control", HeaderValue::from_static("max-age=0")); + headers.insert("dnt", HeaderValue::from_static("1")); + headers.insert("priority", HeaderValue::from_static("u=0, i")); + headers.insert("sec-ch-ua", HeaderValue::from_static(r#""Chromium";v="148", "Google Chrome";v="148", "Not/A)Brand";v="99""#)); + headers.insert("sec-ch-ua-mobile", HeaderValue::from_static("?0")); + headers.insert("sec-ch-ua-platform", HeaderValue::from_static(r#""Windows""#)); + headers.insert("sec-fetch-dest", HeaderValue::from_static("document")); + headers.insert("sec-fetch-mode", HeaderValue::from_static("navigate")); + headers.insert("sec-fetch-site", HeaderValue::from_static("same-origin")); + headers.insert("sec-fetch-user", HeaderValue::from_static("?1")); + headers.insert("upgrade-insecure-requests", HeaderValue::from_static("1")); + headers.insert("user-agent", HeaderValue::from_static("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36")); + + wreq::Client::builder() + .emulation(wreq_util::Emulation::Chrome134) + .default_headers(headers) + .cookie_store(true) + .build() + .map_err(|e| e.to_string()) +}
+
+/// Resolves `href` against `base` and returns an absolute URL string.
+///
+/// * `http://` and `https://` links are returned unchanged.
+/// * Protocol-relative links (`//host/...`) are given the `https:` scheme.
+/// * Links starting with `/` are appended to `base` (trailing slashes removed).
+/// * Anything else is appended to `base` with a single `/` in between.
+pub fn absolute_url(href: &str, base: &str) -> String {
+    if href.starts_with("http://") || href.starts_with("https://") {
+        return href.to_string();
+    }
+    if href.starts_with("//") {
+        return format!("https:{}", href);
+    }
+    let root = base.trim_end_matches('/');
+    if href.starts_with('/') {
+        format!("{}{}", root, href)
+    } else {
+        format!("{}/{}", root, href)
+    }
+}
+
+/// Extracts a single usable URL from an image attribute value.
+///
+/// Accepts plain URLs, `srcset`-style candidate lists such as
+/// `"a.jpg 1x, b.jpg 2x"` (the first candidate wins) and CSS `url(...)`
+/// wrappers with or without quotes. Returns an empty string for empty input.
+pub fn pick_image_source(value: &str) -> String {
+    let candidate = value.split(',').next().unwrap_or("").trim();
+    let unwrapped = candidate
+        .trim_start_matches("url(\"")
+        .trim_start_matches("url('")
+        .trim_start_matches("url(")
+        .trim_end_matches("\")")
+        .trim_end_matches("')")
+        .trim_end_matches(")");
+    if candidate.starts_with("url(") {
+        // Whatever sat inside a CSS `url(...)` wrapper is returned untouched.
+        return unwrapped.to_string();
+    }
+    // A srcset candidate is `<url> <descriptor>`; keeping the width or density
+    // descriptor would leave whitespace inside the URL, so stop at the first gap.
+    unwrapped.split_whitespace().next().unwrap_or("").to_string()
+}
+
+pub async fn scrape_wallpaperflare( + client: &wreq::Client, + limit: usize, + page: u32, + search_query: Option<&str>, +) -> Result, String> { + // println!( + // "[scraper:wallpaperflare] starting scrape - page: {}, limit: {}", + // page, limit + // ); + + let url = if let Some(query) = search_query { + let q = query.replace(" ", "+"); + if page > 1 { + format!("https://www.wallpaperflare.com/search?wallpaper={}&page={}", q, page) + } else { + format!("https://www.wallpaperflare.com/search?wallpaper={}", q) + } + } else { + if page > 1 { + format!("https://www.wallpaperflare.com/index.php?page={}", page) + } else { + "https://www.wallpaperflare.com/".to_string() + } + }; + + // println!("[scraper:wallpaperflare] fetching: {}", url); + + let response = client + .get(&url) + .header("Referer", "https://www.wallpaperflare.com/") + .header("Sec-Fetch-Site", "same-origin") + .timeout(std::time::Duration::from_secs(20)) + .send() + .await + .map_err(|e| format!("request failed: {}", e))?; + + let status = response.status(); + if !status.is_success() { + return Err(format!("http {}", status)); + } + + let html = response.text().await.map_err(|e| e.to_string())?; + + if html.contains("cf-browser-verification") || html.contains("Checking your browser") { + // println!("[scraper:wallpaperflare] cloudflare challenge detected!"); + return Err("cloudflare challenge - browser verification required".to_string()); + } + + let mut temp_items = Vec::new(); + let mut seen_ids = HashSet::new(); + + { + let document = Html::parse_document(&html); + let li_selector = Selector::parse("li[itemprop=\"associatedMedia\"]").unwrap(); + let link_selector = 
Selector::parse("a[itemprop=\"url\"]").unwrap(); + let img_selector = Selector::parse("img[itemprop=\"contentUrl\"]").unwrap(); + let keywords_selector = Selector::parse("meta[itemprop=\"keywords\"]").unwrap(); + + for li_element in document.select(&li_selector) { + if temp_items.len() >= limit { + break; + } + + let link_element = match li_element.select(&link_selector).next() { + Some(el) => el, + None => continue, + }; + + let href = link_element.value().attr("href").unwrap_or(""); + if href.is_empty() + || href.starts_with('#') + || href.starts_with("/search") + || href.starts_with("/tag") + || href.starts_with("/page") + || href == "/" + || !href.contains("wallpaper") + { + continue; + } + + let normalized_href = absolute_url(href, "https://www.wallpaperflare.com"); + if !normalized_href.to_lowercase().contains("wallpaper") { + continue; + } + + let media_elem = match link_element.select(&img_selector).next() { + Some(el) => el, + None => continue, + }; + let thumb = media_elem + .value() + .attr("data-src") + .or_else(|| media_elem.value().attr("data-original")) + .or_else(|| media_elem.value().attr("data-srcset")) + .or_else(|| media_elem.value().attr("srcset")) + .or_else(|| media_elem.value().attr("src")) + .map(pick_image_source) + .unwrap_or_default(); + + if thumb.is_empty() { + continue; + } + + let id = href + .trim_start_matches('/') + .split('-') + .next_back() + .unwrap_or("") + .to_string(); + + if id.is_empty() || id.len() < 3 || seen_ids.contains(&id) { + continue; + } + seen_ids.insert(id.clone()); + + let thumbnail_url = absolute_url(&thumb, "https://www.wallpaperflare.com"); + let title = media_elem + .value() + .attr("alt") + .or_else(|| media_elem.value().attr("title")) + .unwrap_or("WallpaperFlare Wallpaper") + .to_string(); + + let tags = li_element + .select(&keywords_selector) + .next() + .and_then(|el| el.value().attr("content")) + .map(|content| { + content + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + 
.collect::>() + }) + .unwrap_or_default(); + + temp_items.push((id, title, thumbnail_url, normalized_href, tags)); + } + } + + if temp_items.is_empty() { + // println!("[scraper:wallpaperflare] no items found"); + return Err("wallpaperflare returned no results".to_string()); + } + + // println!( + // "[scraper:wallpaperflare] collected {} items, resolving download urls...", + // temp_items.len() + // ); + + let mut handles = Vec::new(); + for (id, title, thumb, detail_url, tags) in temp_items { + let client = client.clone(); + let detail = detail_url.clone(); + + handles.push(tokio::spawn(async move { + let download_url = resolve_wallpaperflare_download(&client, &detail).await; + (id, title, thumb, detail_url, download_url, tags) + })); + } + + let mut items = Vec::new(); + for handle in handles { + match handle.await { + Ok((id, title, thumbnail_url, detail_url, download_result, tags)) => { + let download_url = match download_result { + Ok(url) => url, + Err(e) => { + // println!(" [warn] failed to resolve {}: {}", id, e); + thumbnail_url.clone() + } + }; + + items.push(WallpaperEntry { + id: format!("wallpaperflare-{}", id), + title, + thumbnail_url, + detail_url, + download_url, + tags, + }); + } + Err(_e) => { + // println!(" [warn] task failed: {}", e); + } + } + } + + // println!("[scraper:wallpaperflare] resolved {} download urls", items.len()); + Ok(items) +} + +pub async fn resolve_wallpaperflare_download( + client: &wreq::Client, + detail_url: &str, +) -> Result { + let absolute = absolute_url(detail_url, "https://www.wallpaperflare.com"); + let download_page_url = format!("{}/download", absolute.trim_end_matches('/')); + + if let Ok(response) = client + .get(&download_page_url) + .header("Referer", &absolute) + .send() + .await + { + if let Ok(html) = response.text().await { + let document = Html::parse_document(&html); + let show_img_selector = Selector::parse("#show_img").unwrap(); + let content_url_selector = 
Selector::parse("img[itemprop=\"contentUrl\"]").unwrap(); + + let high_res_image = document + .select(&show_img_selector) + .next() + .and_then(|el| el.value().attr("src")) + .or_else(|| { + document + .select(&content_url_selector) + .next() + .and_then(|el| el.value().attr("src")) + }); + + if let Some(img_url) = high_res_image { + let final_url = absolute_url(img_url, "https://www.wallpaperflare.com"); + return Ok(final_url); + } + } + } + + match client.get(&absolute) + .header("Referer", "https://www.wallpaperflare.com/") + .send() + .await + { + Ok(response) => { + let html = response.text().await.map_err(|e| e.to_string())?; + let document = Html::parse_document(&html); + let content_url_selector = Selector::parse("img[itemprop=\"contentUrl\"]").unwrap(); + let vimg_selector = Selector::parse("#vimg").unwrap(); + let og_image_selector = Selector::parse("meta[property=\"og:image\"]").unwrap(); + + let detail_image = document + .select(&content_url_selector) + .next() + .and_then(|el| el.value().attr("src")) + .map(pick_image_source) + .or_else(|| { + document + .select(&vimg_selector) + .next() + .and_then(|el| el.value().attr("src")) + .map(pick_image_source) + }) + .or_else(|| { + document + .select(&og_image_selector) + .next() + .and_then(|el| el.value().attr("content")) + .map(pick_image_source) + }); + + if let Some(img_url) = detail_image { + let final_url = absolute_url(&img_url, "https://www.wallpaperflare.com"); + return Ok(final_url); + } + + Err("no image found on detail page".to_string()) + } + Err(e) => Err(format!("failed to fetch detail page: {}", e)), + } +} + +pub async fn download_wallpaper(client: &wreq::Client, url: &str, path: &Path) -> Result { + const MAX_FILE_SIZE: u64 = 30 * 1024 * 1024; + + let response = client + .get(url) + .header("Referer", "https://www.wallpaperflare.com/") + .timeout(std::time::Duration::from_secs(60)) + .send() + .await + .map_err(|e| format!("download request failed: {}", e))?; + + if 
!response.status().is_success() { + return Err(format!("http {}", response.status())); + } + + // bail early using content-length header — no need to download 140 mb of garbage + if let Some(cl) = response.content_length() { + if cl > MAX_FILE_SIZE { + return Err(format!( + "file too large ({:.2} MB, limit is {} MB) — skipping", + cl as f64 / (1024.0 * 1024.0), + MAX_FILE_SIZE / (1024 * 1024) + )); + } + } + + let bytes = response.bytes().await.map_err(|e| e.to_string())?; + let len = bytes.len() as u64; + + // safety net in case content-length header was missing or lied + if len > MAX_FILE_SIZE { + return Err(format!( + "file too large ({:.2} MB, limit is {} MB) — skipping", + len as f64 / (1024.0 * 1024.0), + MAX_FILE_SIZE / (1024 * 1024) + )); + } + + std::fs::write(path, &bytes).map_err(|e| format!("write failed: {}", e))?; + + Ok(len) +} diff --git a/src/wallpapersclan.rs b/src/wallpapersclan.rs new file mode 100644 index 0000000..d20e796 --- /dev/null +++ b/src/wallpapersclan.rs @@ -0,0 +1,352 @@ +#![allow(dead_code, unused_variables)] +use scraper::{Html, Selector}; +use std::path::Path; +use wreq_util::Emulation; + +const BASE_URL: &str = "https://wallpapers-clan.com"; +const DESKTOP_URL: &str = "https://wallpapers-clan.com/desktop-wallpapers/"; + +use serde::{Serialize, Deserialize}; + +#[derive(Serialize, Deserialize)] +pub struct WallpaperEntry { + pub id: String, + pub title: String, + pub thumbnail_url: String, + pub detail_url: String, + pub download_url: String, + pub tags: Vec, +} + +/// build a wreq client impersonating chrome — no cookies needed, +/// the tls fingerprint alone bypasses cloudflare's managed challenge +fn build_client() -> Result { + wreq::Client::builder() + .emulation(Emulation::Chrome134) + .cookie_store(true) + .build() + .map_err(|e| e.to_string()) +} + +/// scrape the listing page and resolve download urls from detail pages +pub async fn scrape_wallpapersclan( + limit: usize, + page: u32, +) -> Result, String> { + let client = 
build_client()?; + + let (url, response) = if page > 1 { + let ajax_url = format!("{}/wp-admin/admin-ajax.php", BASE_URL); + println!("[listing] fetching (ajax): {} (page {})", ajax_url, page); + + let mut form = std::collections::HashMap::new(); + form.insert("action", "boldlab_get_new_posts"); + form.insert("options[plugin]", "boldlab_core"); + form.insert("options[module]", "post-types/dwallpapers/shortcodes"); + form.insert("options[shortcode]", "dwallpapers-list"); + form.insert("options[post_type]", "dwallpapers"); + let page_str = page.to_string(); + form.insert("options[next_page]", &page_str); + form.insert("options[max_pages_num]", "863"); + form.insert("options[show_category]", "no"); + form.insert("options[behavior]", "columns"); + form.insert("options[images_proportion]", "full"); + form.insert("options[columns]", "3"); + form.insert("options[space]", "normal"); + form.insert("options[columns_responsive]", "predefined"); + form.insert("options[columns_1440]", "3"); + form.insert("options[columns_1366]", "3"); + form.insert("options[columns_1024]", "3"); + form.insert("options[columns_768]", "3"); + form.insert("options[columns_680]", "3"); + form.insert("options[columns_480]", "3"); + form.insert("options[posts_per_page]", "12"); + form.insert("options[orderby]", "date"); + form.insert("options[order]", "DESC"); + form.insert("options[additional_params]", "tax"); + form.insert("options[layout]", "info-below"); + form.insert("options[hover_animation_info-below]", "tilt"); + form.insert("options[hover_animation_info-follow]", "follow"); + form.insert("options[hover_animation_info-on-hover]", "direction-aware"); + form.insert("options[title_tag]", "h4"); + form.insert("options[custom_padding]", "no"); + form.insert("options[enable_filter]", "yes"); + form.insert("options[pagination_type]", "infinite-scroll"); + form.insert("options[loading_animation]", "no"); + form.insert("options[object_class_name]", "BoldlabCoredwallpapersListShortcode"); + 
form.insert("options[taxonomy_filter]", "dwallpapers-category"); + form.insert("options[space_value]", "15"); + form.insert("options[justified_attr]", "{\"rowHeight\":\"\",\"spaceBetween\":15}"); + + let resp = client + .post(&ajax_url) + .header("Referer", DESKTOP_URL) + .header("X-Requested-With", "XMLHttpRequest") + .form(&form) + .timeout(std::time::Duration::from_secs(20)) + .send() + .await + .map_err(|e| format!("ajax request failed: {}", e))?; + + (ajax_url, resp) + } else { + println!("[listing] fetching: {}", DESKTOP_URL); + let resp = client + .get(DESKTOP_URL) + .header("Referer", BASE_URL) + .timeout(std::time::Duration::from_secs(20)) + .send() + .await + .map_err(|e| format!("request failed: {}", e))?; + (DESKTOP_URL.to_string(), resp) + }; + + let status = response.status(); + println!("[listing] http {}", status); + + if !status.is_success() { + return Err(format!("HTTP {} from wallpapers-clan", status)); + } + + let raw_text = response.text().await.map_err(|e| e.to_string())?; + + let html = if page > 1 { + println!("[listing] raw response: {:.200}", raw_text); + // the ajax response is JSON with a "data" string containing the HTML + let json: serde_json::Value = serde_json::from_str(&raw_text) + .map_err(|e| format!("failed to parse ajax json: {}", e))?; + json["data"] + .as_str() + .unwrap_or("") + .to_string() + } else { + raw_text + }; + + // first pass: collect listing data + let mut listing_items: Vec<(String, String, String, Vec)> = Vec::new(); + + { + let document = Html::parse_document(&html); + + // selectors for the qodef grid layout + let article_selector = Selector::parse("article.qodef-grid-item").unwrap(); + let media_link_selector = Selector::parse(".qodef-e-media-image a[itemprop='url']").unwrap(); + let img_selector = Selector::parse("img.wp-post-image").unwrap(); + let noscript_selector = Selector::parse("noscript").unwrap(); + let title_selector = Selector::parse("h4.qodef-e-title a.qodef-e-title-link").unwrap(); + let 
category_selector = Selector::parse(".qodef-e-info-category a.qodef-e-category").unwrap(); + + let articles: Vec<_> = document.select(&article_selector).collect(); + println!("[listing] found {} articles", articles.len()); + + for article in articles.iter() { + if listing_items.len() >= limit { + break; + } + + // detail page url + let detail_url = match article.select(&media_link_selector).next() { + Some(a) => match a.value().attr("href") { + Some(href) if href.contains("desktop-wallpapers") => href.to_string(), + _ => continue, + }, + None => continue, + }; + + // thumbnail — data-lazy-src > data-lazy-srcset > noscript fallback + let thumbnail_url = article + .select(&img_selector) + .next() + .and_then(|img| { + if let Some(src) = img.value().attr("data-lazy-src") { + if !src.contains("data:image/svg") { + return Some(src.to_string()); + } + } + if let Some(srcset) = img.value().attr("data-lazy-srcset") { + if let Some(first) = srcset.split(',').next() { + let url = first.trim().split_whitespace().next().unwrap_or(""); + if !url.is_empty() && !url.contains("data:image/svg") { + return Some(url.to_string()); + } + } + } + if let Some(s) = img.value().attr("src") { + if !s.contains("data:image/svg") { + return Some(s.to_string()); + } + } + None + }) + .or_else(|| { + article.select(&noscript_selector).next().and_then(|ns| { + let inner = ns.inner_html(); + let frag = Html::parse_fragment(&inner); + let img_sel = Selector::parse("img").unwrap(); + frag.select(&img_sel).next().and_then(|img| { + img.value() + .attr("src") + .filter(|s| !s.contains("data:image/svg")) + .map(|s| s.to_string()) + }) + }) + }); + + let thumbnail_url = match thumbnail_url { + Some(url) => url, + None => continue, + }; + + // title + let title = article + .select(&title_selector) + .next() + .map(|t| t.text().collect::().trim().to_string()) + .unwrap_or_else(|| "Untitled".to_string()); + + // tags from categories + let tags: Vec = article + .select(&category_selector) + .map(|cat| 
cat.text().collect::().trim().to_string()) + .filter(|t| !t.is_empty()) + .collect(); + + listing_items.push((detail_url, thumbnail_url, title, tags)); + } + } + + println!("[listing] collected {} items, resolving download urls...", listing_items.len()); + + // second pass: resolve download urls from detail pages concurrently + let mut handles = Vec::new(); + for (detail_url, thumb, title, tags) in listing_items { + let client = client.clone(); + let detail = detail_url.clone(); + handles.push(tokio::spawn(async move { + let download_url = resolve_download(&client, &detail).await; + (detail_url, thumb, title, tags, download_url) + })); + } + + let mut items = Vec::new(); + for handle in handles { + match handle.await { + Ok((detail_url, thumbnail_url, title, tags, download_result)) => { + // slug for id + let slug = detail_url + .trim_end_matches('/') + .split('/') + .next_back() + .unwrap_or("unknown") + .to_string(); + + let download_url = match download_result { + Ok(url) => url, + Err(e) => { + println!(" [warn] failed to resolve {}: {}", slug, e); + // fallback to thumbnail as download + thumbnail_url.clone() + } + }; + + items.push(WallpaperEntry { + id: slug, + title, + thumbnail_url, + detail_url, + download_url, + tags, + }); + } + Err(e) => { + println!(" [warn] task failed: {}", e); + } + } + } + + println!("[listing] resolved {} download urls", items.len()); + Ok(items) +} + +/// resolve the actual download url from a detail page +/// grabs a.wpdm-download-link[data-downloadurl] — baked in by wordpress +async fn resolve_download(client: &wreq::Client, detail_url: &str) -> Result { + let response = client + .get(detail_url) + .header("Referer", DESKTOP_URL) + .timeout(std::time::Duration::from_secs(15)) + .send() + .await + .map_err(|e| format!("request failed: {}", e))?; + + if !response.status().is_success() { + return Err(format!("HTTP {}", response.status())); + } + + let html = response.text().await.map_err(|e| e.to_string())?; + let document = 
Html::parse_document(&html); + + // primary: wpdm download button with data-downloadurl + let download_btn = Selector::parse("a.wpdm-download-link").unwrap(); + if let Some(btn) = document.select(&download_btn).next() { + if let Some(url) = btn.value().attr("data-downloadurl") { + if !url.is_empty() { + return Ok(url.to_string()); + } + } + } + + // fallback: any download link + let fallback = Selector::parse(".media-body a[href*='download']").unwrap(); + if let Some(link) = document.select(&fallback).next() { + if let Some(href) = link.value().attr("href") { + return Ok(href.to_string()); + } + } + + // last resort: full-res image on the page + let img_sel = Selector::parse("img.wp-post-image").unwrap(); + if let Some(img) = document.select(&img_sel).next() { + let src = img + .value() + .attr("data-lazy-src") + .or_else(|| { + img.value().attr("data-lazy-srcset").and_then(|srcset| { + srcset.split(',').next().and_then(|s| s.trim().split_whitespace().next()) + }) + }) + .or_else(|| img.value().attr("src")) + .filter(|s| !s.contains("data:image/svg")); + + if let Some(url) = src { + return Ok(url.to_string()); + } + } + + Err("no download url found".to_string()) +} + +/// download a file to disk, returns bytes written +pub async fn download_wallpaper(url: &str, path: &Path) -> Result { + let client = build_client()?; + + let response = client + .get(url) + .header("Referer", DESKTOP_URL) + .timeout(std::time::Duration::from_secs(60)) + .send() + .await + .map_err(|e| format!("download request failed: {}", e))?; + + if !response.status().is_success() { + return Err(format!("HTTP {}", response.status())); + } + + let bytes = response.bytes().await.map_err(|e| e.to_string())?; + let len = bytes.len() as u64; + + std::fs::write(path, &bytes).map_err(|e| format!("write failed: {}", e))?; + + Ok(len) +}