356 lines
13 KiB
Plaintext
356 lines
13 KiB
Plaintext
use scraper::{Html, Selector};
|
|
use std::collections::HashSet;
|
|
use std::path::Path;
|
|
use wreq_util::Emulation;
|
|
|
|
use crate::wallpapersclan::WallpaperEntry;
|
|
|
|
fn build_client() -> Result<wreq::Client, String> {
|
|
use wreq::header::{HeaderMap, HeaderValue};
|
|
let mut headers = HeaderMap::new();
|
|
headers.insert("accept", HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"));
|
|
headers.insert("accept-encoding", HeaderValue::from_static("gzip, deflate, br, zstd"));
|
|
headers.insert("accept-language", HeaderValue::from_static("en-US,en;q=0.9,hi;q=0.8,de;q=0.7,ja;q=0.6"));
|
|
headers.insert("cache-control", HeaderValue::from_static("max-age=0"));
|
|
headers.insert("dnt", HeaderValue::from_static("1"));
|
|
headers.insert("priority", HeaderValue::from_static("u=0, i"));
|
|
headers.insert("sec-ch-ua", HeaderValue::from_static(r#""Chromium";v="148", "Google Chrome";v="148", "Not/A)Brand";v="99""#));
|
|
headers.insert("sec-ch-ua-mobile", HeaderValue::from_static("?0"));
|
|
headers.insert("sec-ch-ua-platform", HeaderValue::from_static(r#""Windows""#));
|
|
headers.insert("sec-fetch-dest", HeaderValue::from_static("document"));
|
|
headers.insert("sec-fetch-mode", HeaderValue::from_static("navigate"));
|
|
headers.insert("sec-fetch-site", HeaderValue::from_static("same-origin"));
|
|
headers.insert("sec-fetch-user", HeaderValue::from_static("?1"));
|
|
headers.insert("upgrade-insecure-requests", HeaderValue::from_static("1"));
|
|
headers.insert("user-agent", HeaderValue::from_static("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36"));
|
|
|
|
wreq::Client::builder()
|
|
.emulation(wreq_util::Emulation::Chrome134)
|
|
.default_headers(headers)
|
|
.cookie_store(true)
|
|
.build()
|
|
.map_err(|e| e.to_string())
|
|
}
|
|
|
|
// url normalization
|
|
/// Resolves `href` against `base` and returns an absolute URL string.
///
/// Resolution rules, checked in order:
/// 1. `http://` / `https://` URLs (scheme matched case-insensitively) are returned as-is.
/// 2. Protocol-relative URLs (`//host/path`) are given the `https:` scheme.
/// 3. Root-relative paths (`/path`) are appended to `base`.
/// 4. Anything else is treated as a relative path and joined to `base` with a `/`.
///
/// Leading and trailing whitespace around `href` is ignored, because scraped
/// attribute values frequently carry stray spaces or newlines that would otherwise
/// make an absolute URL look like a relative path. Trailing slashes on `base` are
/// dropped before joining so the result never contains a doubled slash at the seam.
pub fn absolute_url(href: &str, base: &str) -> String {
    let href = href.trim();
    let base = base.trim_end_matches('/');

    // `get(..n)` yields `None` for short strings and non-char-boundary cuts, so this
    // never panics on multi-byte input.
    let has_scheme = |scheme: &str| {
        href.get(..scheme.len())
            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(scheme))
    };

    if has_scheme("http://") || has_scheme("https://") {
        href.to_string()
    } else if href.starts_with("//") {
        format!("https:{}", href)
    } else if href.starts_with('/') {
        format!("{}{}", base, href)
    } else {
        format!("{}/{}", base, href)
    }
}
|
|
|
|
/// Extracts a single image URL from an attribute value.
///
/// Accepts the value of `src`-like attributes, `srcset`-style candidate lists and
/// CSS `url(...)` expressions:
///
/// * Only the first comma-separated candidate is considered.
/// * A `url(...)` wrapper (optionally quoted with `"` or `'`) is removed.
/// * Otherwise any `srcset` descriptor following the URL (`1x`, `640w`, ...) is
///   dropped by keeping only the first whitespace-delimited token.
///
/// Closing parentheses are only stripped when a `url(` prefix was present, so plain
/// URLs that legitimately end in `)` are left intact.
///
/// Returns an empty `String` when no URL can be extracted.
pub fn pick_image_source(value: &str) -> String {
    let candidate = value.split(',').next().unwrap_or("").trim();
    if candidate.is_empty() {
        return String::new();
    }

    if let Some(wrapped) = candidate.strip_prefix("url(") {
        let inner = wrapped.strip_suffix(')').unwrap_or(wrapped).trim();
        return inner
            .trim_matches(|c| c == '"' || c == '\'')
            .trim()
            .to_string();
    }

    // `srcset` candidates look like "<url> <descriptor>"; keep only the URL.
    candidate
        .split_whitespace()
        .next()
        .unwrap_or("")
        .to_string()
}
|
|
|
|
pub async fn scrape_wallpaperflare(
|
|
limit: usize,
|
|
page: u32,
|
|
) -> Result<Vec<WallpaperEntry>, String> {
|
|
println!(
|
|
"[scraper:wallpaperflare] starting scrape - page: {}, limit: {}",
|
|
page, limit
|
|
);
|
|
|
|
let client = build_client()?;
|
|
|
|
let url = if page > 1 {
|
|
format!(
|
|
"https://www.wallpaperflare.com/index.php?page={}",
|
|
page
|
|
)
|
|
} else {
|
|
"https://www.wallpaperflare.com/".to_string()
|
|
};
|
|
|
|
println!("[scraper:wallpaperflare] fetching: {}", url);
|
|
|
|
let response = client
|
|
.get(&url)
|
|
.header("Referer", "https://www.wallpaperflare.com/")
|
|
.header("Sec-Fetch-Site", "same-origin")
|
|
.timeout(std::time::Duration::from_secs(20))
|
|
.send()
|
|
.await
|
|
.map_err(|e| format!("request failed: {}", e))?;
|
|
|
|
let status = response.status();
|
|
if !status.is_success() {
|
|
return Err(format!("http {} - possibly cloudflare blocked", status));
|
|
}
|
|
|
|
let html = response.text().await.map_err(|e| e.to_string())?;
|
|
|
|
if html.contains("cf-browser-verification") || html.contains("Checking your browser") {
|
|
println!("[scraper:wallpaperflare] cloudflare challenge detected!");
|
|
return Err("cloudflare challenge - browser verification required".to_string());
|
|
}
|
|
|
|
let mut temp_items = Vec::new();
|
|
let mut seen_ids = HashSet::new();
|
|
|
|
{
|
|
let document = Html::parse_document(&html);
|
|
let li_selector = Selector::parse("li[itemprop=\"associatedMedia\"]").unwrap();
|
|
let link_selector = Selector::parse("a[itemprop=\"url\"]").unwrap();
|
|
let img_selector = Selector::parse("img[itemprop=\"contentUrl\"]").unwrap();
|
|
let keywords_selector = Selector::parse("meta[itemprop=\"keywords\"]").unwrap();
|
|
|
|
for li_element in document.select(&li_selector) {
|
|
if temp_items.len() >= limit {
|
|
break;
|
|
}
|
|
|
|
let link_element = match li_element.select(&link_selector).next() {
|
|
Some(el) => el,
|
|
None => continue,
|
|
};
|
|
|
|
let href = link_element.value().attr("href").unwrap_or("");
|
|
if href.is_empty()
|
|
|| href.starts_with('#')
|
|
|| href.starts_with("/search")
|
|
|| href.starts_with("/tag")
|
|
|| href.starts_with("/page")
|
|
|| href == "/"
|
|
|| !href.contains("wallpaper")
|
|
{
|
|
continue;
|
|
}
|
|
|
|
let normalized_href = absolute_url(href, "https://www.wallpaperflare.com");
|
|
if !normalized_href.to_lowercase().contains("wallpaper") {
|
|
continue;
|
|
}
|
|
|
|
let media_elem = match link_element.select(&img_selector).next() {
|
|
Some(el) => el,
|
|
None => continue,
|
|
};
|
|
let thumb = media_elem
|
|
.value()
|
|
.attr("data-src")
|
|
.or_else(|| media_elem.value().attr("data-original"))
|
|
.or_else(|| media_elem.value().attr("data-srcset"))
|
|
.or_else(|| media_elem.value().attr("srcset"))
|
|
.or_else(|| media_elem.value().attr("src"))
|
|
.map(pick_image_source)
|
|
.unwrap_or_default();
|
|
|
|
if thumb.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
let id = href
|
|
.trim_start_matches('/')
|
|
.split('-')
|
|
.next_back()
|
|
.unwrap_or("")
|
|
.to_string();
|
|
|
|
if id.is_empty() || id.len() < 3 || seen_ids.contains(&id) {
|
|
continue;
|
|
}
|
|
seen_ids.insert(id.clone());
|
|
|
|
let thumbnail_url = absolute_url(&thumb, "https://www.wallpaperflare.com");
|
|
let title = media_elem
|
|
.value()
|
|
.attr("alt")
|
|
.or_else(|| media_elem.value().attr("title"))
|
|
.unwrap_or("WallpaperFlare Wallpaper")
|
|
.to_string();
|
|
|
|
let tags = li_element
|
|
.select(&keywords_selector)
|
|
.next()
|
|
.and_then(|el| el.value().attr("content"))
|
|
.map(|content| {
|
|
content
|
|
.split(',')
|
|
.map(|s| s.trim().to_string())
|
|
.filter(|s| !s.is_empty())
|
|
.collect::<Vec<String>>()
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
temp_items.push((id, title, thumbnail_url, normalized_href, tags));
|
|
}
|
|
}
|
|
|
|
if temp_items.is_empty() {
|
|
println!("[scraper:wallpaperflare] no items found");
|
|
return Err("wallpaperflare returned no results".to_string());
|
|
}
|
|
|
|
println!(
|
|
"[scraper:wallpaperflare] collected {} items, resolving download urls...",
|
|
temp_items.len()
|
|
);
|
|
|
|
let mut handles = Vec::new();
|
|
let mut delay_ms = 0;
|
|
for (id, title, thumb, detail_url, tags) in temp_items {
|
|
let client = client.clone();
|
|
let detail = detail_url.clone();
|
|
|
|
delay_ms += 1000;
|
|
let delay = std::time::Duration::from_millis(delay_ms);
|
|
|
|
handles.push(tokio::spawn(async move {
|
|
tokio::time::sleep(delay).await;
|
|
let download_url = resolve_wallpaperflare_download(&client, &detail).await;
|
|
(id, title, thumb, detail_url, download_url, tags)
|
|
}));
|
|
}
|
|
|
|
let mut items = Vec::new();
|
|
for handle in handles {
|
|
match handle.await {
|
|
Ok((id, title, thumbnail_url, detail_url, download_result, tags)) => {
|
|
let download_url = match download_result {
|
|
Ok(url) => url,
|
|
Err(e) => {
|
|
println!(" [warn] failed to resolve {}: {}", id, e);
|
|
thumbnail_url.clone()
|
|
}
|
|
};
|
|
|
|
items.push(WallpaperEntry {
|
|
id: format!("wallpaperflare-{}", id),
|
|
title,
|
|
thumbnail_url,
|
|
detail_url,
|
|
download_url,
|
|
tags,
|
|
});
|
|
}
|
|
Err(e) => {
|
|
println!(" [warn] task failed: {}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
println!("[scraper:wallpaperflare] resolved {} download urls", items.len());
|
|
Ok(items)
|
|
}
|
|
|
|
pub async fn resolve_wallpaperflare_download(
|
|
client: &wreq::Client,
|
|
detail_url: &str,
|
|
) -> Result<String, String> {
|
|
let absolute = absolute_url(detail_url, "https://www.wallpaperflare.com");
|
|
let download_page_url = format!("{}/download", absolute.trim_end_matches('/'));
|
|
|
|
if let Ok(response) = client
|
|
.get(&download_page_url)
|
|
.header("Referer", &absolute)
|
|
.send()
|
|
.await
|
|
{
|
|
if let Ok(html) = response.text().await {
|
|
let document = Html::parse_document(&html);
|
|
let show_img_selector = Selector::parse("#show_img").unwrap();
|
|
let content_url_selector = Selector::parse("img[itemprop=\"contentUrl\"]").unwrap();
|
|
|
|
let high_res_image = document
|
|
.select(&show_img_selector)
|
|
.next()
|
|
.and_then(|el| el.value().attr("src"))
|
|
.or_else(|| {
|
|
document
|
|
.select(&content_url_selector)
|
|
.next()
|
|
.and_then(|el| el.value().attr("src"))
|
|
});
|
|
|
|
if let Some(img_url) = high_res_image {
|
|
let final_url = absolute_url(img_url, "https://www.wallpaperflare.com");
|
|
return Ok(final_url);
|
|
}
|
|
}
|
|
}
|
|
|
|
match client.get(&absolute)
|
|
.header("Referer", "https://www.wallpaperflare.com/")
|
|
.send()
|
|
.await
|
|
{
|
|
Ok(response) => {
|
|
let html = response.text().await.map_err(|e| e.to_string())?;
|
|
let document = Html::parse_document(&html);
|
|
let content_url_selector = Selector::parse("img[itemprop=\"contentUrl\"]").unwrap();
|
|
let vimg_selector = Selector::parse("#vimg").unwrap();
|
|
let og_image_selector = Selector::parse("meta[property=\"og:image\"]").unwrap();
|
|
|
|
let detail_image = document
|
|
.select(&content_url_selector)
|
|
.next()
|
|
.and_then(|el| el.value().attr("src"))
|
|
.map(pick_image_source)
|
|
.or_else(|| {
|
|
document
|
|
.select(&vimg_selector)
|
|
.next()
|
|
.and_then(|el| el.value().attr("src"))
|
|
.map(pick_image_source)
|
|
})
|
|
.or_else(|| {
|
|
document
|
|
.select(&og_image_selector)
|
|
.next()
|
|
.and_then(|el| el.value().attr("content"))
|
|
.map(pick_image_source)
|
|
});
|
|
|
|
if let Some(img_url) = detail_image {
|
|
let final_url = absolute_url(&img_url, "https://www.wallpaperflare.com");
|
|
return Ok(final_url);
|
|
}
|
|
|
|
Err("no image found on detail page".to_string())
|
|
}
|
|
Err(e) => Err(format!("failed to fetch detail page: {}", e)),
|
|
}
|
|
}
|
|
|
|
pub async fn download_wallpaper(url: &str, path: &Path) -> Result<u64, String> {
|
|
let client = build_client()?;
|
|
|
|
let response = client
|
|
.get(url)
|
|
.header("Referer", "https://www.wallpaperflare.com/")
|
|
.timeout(std::time::Duration::from_secs(60))
|
|
.send()
|
|
.await
|
|
.map_err(|e| format!("download request failed: {}", e))?;
|
|
|
|
if !response.status().is_success() {
|
|
return Err(format!("http {}", response.status()));
|
|
}
|
|
|
|
let bytes = response.bytes().await.map_err(|e| e.to_string())?;
|
|
let len = bytes.len() as u64;
|
|
|
|
std::fs::write(path, &bytes).map_err(|e| format!("write failed: {}", e))?;
|
|
|
|
Ok(len)
|
|
}
|