// wallpapers/src/wallpapersclan.rs

#![allow(dead_code, unused_variables)]
use scraper::{Html, Selector};
use std::path::Path;
use wreq_util::Emulation;
/// site root; used to build the `wp-admin/admin-ajax.php` endpoint and sent as
/// the `Referer` when fetching the first listing page
const BASE_URL: &str = "https://wallpapers-clan.com";
/// desktop wallpaper listing page; fetched directly for the first page and sent
/// as the `Referer` on ajax, detail-page and download requests
const DESKTOP_URL: &str = "https://wallpapers-clan.com/desktop-wallpapers/";
use serde::{Serialize, Deserialize};
/// a single wallpaper scraped from the listing, ready to be serialized
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WallpaperEntry {
    /// identifier derived from the last path segment of `detail_url`
    pub id: String,
    /// text of the listing title link, or "Untitled" when the card has none
    pub title: String,
    /// preview image url taken from the listing card
    pub thumbnail_url: String,
    /// url of the wallpaper's detail page
    pub detail_url: String,
    /// url resolved from the detail page; falls back to `thumbnail_url`
    /// when the detail page could not be resolved
    pub download_url: String,
    /// category names shown on the listing card
    pub tags: Vec<String>,
}
/// build the wreq client used for every request in this module
///
/// the client emulates chrome 134 and keeps a cookie store enabled.
/// NOTE(review): the original comment says the tls fingerprint alone gets past
/// cloudflare's managed challenge and that cookies are not needed — the cookie
/// store is enabled anyway; confirm whether it can be dropped.
///
/// returns the builder's error rendered as a `String` when construction fails
fn build_client() -> Result<wreq::Client, String> {
    let builder = wreq::Client::builder()
        .emulation(Emulation::Chrome134)
        .cookie_store(true);
    match builder.build() {
        Ok(client) => Ok(client),
        Err(err) => Err(err.to_string()),
    }
}
/// scrape the listing page and resolve download urls from detail pages
pub async fn scrape_wallpapersclan(
limit: usize,
page: u32,
) -> Result<Vec<WallpaperEntry>, String> {
let client = build_client()?;
let (url, response) = if page > 1 {
let ajax_url = format!("{}/wp-admin/admin-ajax.php", BASE_URL);
println!("[listing] fetching (ajax): {} (page {})", ajax_url, page);
let mut form = std::collections::HashMap::new();
form.insert("action", "boldlab_get_new_posts");
form.insert("options[plugin]", "boldlab_core");
form.insert("options[module]", "post-types/dwallpapers/shortcodes");
form.insert("options[shortcode]", "dwallpapers-list");
form.insert("options[post_type]", "dwallpapers");
let page_str = page.to_string();
form.insert("options[next_page]", &page_str);
form.insert("options[max_pages_num]", "863");
form.insert("options[show_category]", "no");
form.insert("options[behavior]", "columns");
form.insert("options[images_proportion]", "full");
form.insert("options[columns]", "3");
form.insert("options[space]", "normal");
form.insert("options[columns_responsive]", "predefined");
form.insert("options[columns_1440]", "3");
form.insert("options[columns_1366]", "3");
form.insert("options[columns_1024]", "3");
form.insert("options[columns_768]", "3");
form.insert("options[columns_680]", "3");
form.insert("options[columns_480]", "3");
form.insert("options[posts_per_page]", "12");
form.insert("options[orderby]", "date");
form.insert("options[order]", "DESC");
form.insert("options[additional_params]", "tax");
form.insert("options[layout]", "info-below");
form.insert("options[hover_animation_info-below]", "tilt");
form.insert("options[hover_animation_info-follow]", "follow");
form.insert("options[hover_animation_info-on-hover]", "direction-aware");
form.insert("options[title_tag]", "h4");
form.insert("options[custom_padding]", "no");
form.insert("options[enable_filter]", "yes");
form.insert("options[pagination_type]", "infinite-scroll");
form.insert("options[loading_animation]", "no");
form.insert("options[object_class_name]", "BoldlabCoredwallpapersListShortcode");
form.insert("options[taxonomy_filter]", "dwallpapers-category");
form.insert("options[space_value]", "15");
form.insert("options[justified_attr]", "{\"rowHeight\":\"\",\"spaceBetween\":15}");
let resp = client
.post(&ajax_url)
.header("Referer", DESKTOP_URL)
.header("X-Requested-With", "XMLHttpRequest")
.form(&form)
.timeout(std::time::Duration::from_secs(20))
.send()
.await
.map_err(|e| format!("ajax request failed: {}", e))?;
(ajax_url, resp)
} else {
println!("[listing] fetching: {}", DESKTOP_URL);
let resp = client
.get(DESKTOP_URL)
.header("Referer", BASE_URL)
.timeout(std::time::Duration::from_secs(20))
.send()
.await
.map_err(|e| format!("request failed: {}", e))?;
(DESKTOP_URL.to_string(), resp)
};
let status = response.status();
println!("[listing] http {}", status);
if !status.is_success() {
return Err(format!("HTTP {} from wallpapers-clan", status));
}
let raw_text = response.text().await.map_err(|e| e.to_string())?;
let html = if page > 1 {
println!("[listing] raw response: {:.200}", raw_text);
// the ajax response is JSON with a "data" string containing the HTML
let json: serde_json::Value = serde_json::from_str(&raw_text)
.map_err(|e| format!("failed to parse ajax json: {}", e))?;
json["data"]
.as_str()
.unwrap_or("")
.to_string()
} else {
raw_text
};
// first pass: collect listing data
let mut listing_items: Vec<(String, String, String, Vec<String>)> = Vec::new();
{
let document = Html::parse_document(&html);
// selectors for the qodef grid layout
let article_selector = Selector::parse("article.qodef-grid-item").unwrap();
let media_link_selector = Selector::parse(".qodef-e-media-image a[itemprop='url']").unwrap();
let img_selector = Selector::parse("img.wp-post-image").unwrap();
let noscript_selector = Selector::parse("noscript").unwrap();
let title_selector = Selector::parse("h4.qodef-e-title a.qodef-e-title-link").unwrap();
let category_selector = Selector::parse(".qodef-e-info-category a.qodef-e-category").unwrap();
let articles: Vec<_> = document.select(&article_selector).collect();
println!("[listing] found {} articles", articles.len());
for article in articles.iter() {
if listing_items.len() >= limit {
break;
}
// detail page url
let detail_url = match article.select(&media_link_selector).next() {
Some(a) => match a.value().attr("href") {
Some(href) if href.contains("desktop-wallpapers") => href.to_string(),
_ => continue,
},
None => continue,
};
// thumbnail — data-lazy-src > data-lazy-srcset > noscript fallback
let thumbnail_url = article
.select(&img_selector)
.next()
.and_then(|img| {
if let Some(src) = img.value().attr("data-lazy-src") {
if !src.contains("data:image/svg") {
return Some(src.to_string());
}
}
if let Some(srcset) = img.value().attr("data-lazy-srcset") {
if let Some(first) = srcset.split(',').next() {
let url = first.trim().split_whitespace().next().unwrap_or("");
if !url.is_empty() && !url.contains("data:image/svg") {
return Some(url.to_string());
}
}
}
if let Some(s) = img.value().attr("src") {
if !s.contains("data:image/svg") {
return Some(s.to_string());
}
}
None
})
.or_else(|| {
article.select(&noscript_selector).next().and_then(|ns| {
let inner = ns.inner_html();
let frag = Html::parse_fragment(&inner);
let img_sel = Selector::parse("img").unwrap();
frag.select(&img_sel).next().and_then(|img| {
img.value()
.attr("src")
.filter(|s| !s.contains("data:image/svg"))
.map(|s| s.to_string())
})
})
});
let thumbnail_url = match thumbnail_url {
Some(url) => url,
None => continue,
};
// title
let title = article
.select(&title_selector)
.next()
.map(|t| t.text().collect::<String>().trim().to_string())
.unwrap_or_else(|| "Untitled".to_string());
// tags from categories
let tags: Vec<String> = article
.select(&category_selector)
.map(|cat| cat.text().collect::<String>().trim().to_string())
.filter(|t| !t.is_empty())
.collect();
listing_items.push((detail_url, thumbnail_url, title, tags));
}
}
println!("[listing] collected {} items, resolving download urls...", listing_items.len());
// second pass: resolve download urls from detail pages concurrently
let mut handles = Vec::new();
for (detail_url, thumb, title, tags) in listing_items {
let client = client.clone();
let detail = detail_url.clone();
handles.push(tokio::spawn(async move {
let download_url = resolve_download(&client, &detail).await;
(detail_url, thumb, title, tags, download_url)
}));
}
let mut items = Vec::new();
for handle in handles {
match handle.await {
Ok((detail_url, thumbnail_url, title, tags, download_result)) => {
// slug for id
let slug = detail_url
.trim_end_matches('/')
.split('/')
.next_back()
.unwrap_or("unknown")
.to_string();
let download_url = match download_result {
Ok(url) => url,
Err(e) => {
println!(" [warn] failed to resolve {}: {}", slug, e);
// fallback to thumbnail as download
thumbnail_url.clone()
}
};
items.push(WallpaperEntry {
id: slug,
title,
thumbnail_url,
detail_url,
download_url,
tags,
});
}
Err(e) => {
println!(" [warn] task failed: {}", e);
}
}
}
println!("[listing] resolved {} download urls", items.len());
Ok(items)
}
/// resolve the actual download url from a detail page
///
/// sources are tried in order:
/// 1. the first non-empty `data-downloadurl` on any `a.wpdm-download-link`
/// 2. the first non-empty `href` of a `.media-body` link containing "download"
/// 3. the page's `img.wp-post-image` (data-lazy-src > data-lazy-srcset > src),
///    ignoring empty values and inline `data:image/svg` placeholders
///
/// # Errors
///
/// returns a `String` error when the request fails, the status is not a
/// success, the body cannot be read, or none of the sources yields a url
async fn resolve_download(client: &wreq::Client, detail_url: &str) -> Result<String, String> {
    fn usable_image(url: &&str) -> bool {
        !url.is_empty() && !url.contains("data:image/svg")
    }

    let response = client
        .get(detail_url)
        .header("Referer", DESKTOP_URL)
        .timeout(std::time::Duration::from_secs(15))
        .send()
        .await
        .map_err(|e| format!("request failed: {}", e))?;
    let status = response.status();
    if !status.is_success() {
        return Err(format!("HTTP {}", status));
    }
    let html = response.text().await.map_err(|e| e.to_string())?;

    // no `.await` below this point: the parsed document stays local to the
    // synchronous tail of the function
    let document = Html::parse_document(&html);

    // primary: wpdm download button with data-downloadurl
    let download_btn = Selector::parse("a.wpdm-download-link").expect("valid button selector");
    let button_url = document.select(&download_btn).find_map(|btn| {
        btn.value()
            .attr("data-downloadurl")
            .filter(|url| !url.is_empty())
            .map(str::to_string)
    });
    if let Some(url) = button_url {
        return Ok(url);
    }

    // fallback: any download link
    let fallback =
        Selector::parse(".media-body a[href*='download']").expect("valid fallback selector");
    let link_url = document.select(&fallback).find_map(|link| {
        link.value()
            .attr("href")
            .filter(|href| !href.is_empty())
            .map(str::to_string)
    });
    if let Some(url) = link_url {
        return Ok(url);
    }

    // last resort: full-res image on the page. each candidate is filtered on
    // its own so a placeholder in one attribute does not hide a real url in
    // the next one.
    let img_sel = Selector::parse("img.wp-post-image").expect("valid image selector");
    let image_url = document.select(&img_sel).next().and_then(|img| {
        let element = img.value();
        let srcset_first = element
            .attr("data-lazy-srcset")
            .and_then(|srcset| srcset.split(',').next())
            .and_then(|candidate| candidate.split_whitespace().next());
        element
            .attr("data-lazy-src")
            .filter(usable_image)
            .or_else(|| srcset_first.filter(usable_image))
            .or_else(|| element.attr("src").filter(usable_image))
            .map(str::to_string)
    });

    image_url.ok_or_else(|| "no download url found".to_string())
}
/// download a file to disk, returns bytes written
pub async fn download_wallpaper(url: &str, path: &Path) -> Result<u64, String> {
let client = build_client()?;
let response = client
.get(url)
.header("Referer", DESKTOP_URL)
.timeout(std::time::Duration::from_secs(60))
.send()
.await
.map_err(|e| format!("download request failed: {}", e))?;
if !response.status().is_success() {
return Err(format!("HTTP {}", response.status()));
}
let bytes = response.bytes().await.map_err(|e| e.to_string())?;
let len = bytes.len() as u64;
std::fs::write(path, &bytes).map_err(|e| format!("write failed: {}", e))?;
Ok(len)
}