# File: qwork/main.py
# Last commit: Faynot bbd0e487c5 "init"
# 2026-03-27 20:59:30 +03:00
#
# 221 lines
# 5.9 KiB
# Python

import asyncio
import json
import re
from typing import Awaitable, Callable, Optional
from urllib.parse import urljoin

from playwright.async_api import Locator, Page, async_playwright
from playwright_stealth import Stealth
# Site root; site-relative hrefs scraped from the page are resolved against it.
BASE_URL = "https://kwork.ru"
# Listing page that gets scraped.
# NOTE(review): `c=11` presumably selects a project category - confirm.
PROJECTS_URL = f"{BASE_URL}/projects?c=11"
# User-Agent header passed to the browser context (desktop Chrome 122 on Linux).
USER_AGENT = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
# Viewport size passed to the browser context.
VIEWPORT = {"width": 1920, "height": 1080}
# One scraped project: a flat mapping of string fields
# (the extractor in this file fills "title", "price", "url", "description").
Project = dict[str, str]
# Async callback that turns one matched element into a Project; receives the
# page and the element's locator, and may return None to skip the element.
Extractor = Callable[[Page, Locator], Awaitable[Optional[Project]]]
def normalize_text(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def first_line(text: str) -> str:
    """Return the first line of *text* with surrounding whitespace removed.

    Leading blank lines are ignored because the whole text is stripped
    first. Blank or empty input yields an empty string.
    """
    lines = text.strip().splitlines()
    return lines[0].strip() if lines else ""
def normalize_url(href: str) -> str:
    """Resolve *href* against ``BASE_URL`` and return an absolute URL.

    Site-relative paths (``/projects/1``), document-relative paths
    (``projects/1``) and protocol-relative references (``//host/path``) are
    all resolved with ``urljoin``; an already absolute URL keeps its own
    scheme and host.

    Args:
        href: Raw ``href`` attribute value taken from a link.

    Returns:
        The absolute URL, or *href* unchanged when it is empty.
    """
    if not href:
        # urljoin() would return the bare base URL for an empty reference;
        # keep "no link" distinguishable from "link to the home page".
        return href
    # Plain prefixing with BASE_URL breaks on "//host/path" (it also starts
    # with "/"), which is why the standard resolver is used instead.
    return urljoin(BASE_URL, href)
async def safe_inner_text(locator: Locator, default: str = "") -> str:
    """Read the rendered text of *locator* without ever raising.

    Non-breaking spaces are turned into regular spaces and the result is
    stripped. The read is given a 1500 ms timeout.

    Args:
        locator: Element to read.
        default: Value returned when reading the text fails for any reason.

    Returns:
        The cleaned text, or *default* on any error.
    """
    try:
        return (await locator.inner_text(timeout=1500)).replace("\xa0", " ").strip()
    except Exception:
        # Deliberately best-effort: a missing or slow element is not an error
        # for the callers, they simply fall back to the default.
        return default
async def first_text(root: Locator, selectors: list[str], default: str = "") -> str:
    """Return the text of the first selector under *root* that yields any.

    Selectors are tried in order; for each one only the first matching
    element is read, and its text is whitespace-normalised.

    Args:
        root: Locator the selectors are evaluated relative to.
        selectors: Candidate selectors, most preferred first.
        default: Value returned when no selector produces non-empty text.

    Returns:
        The first non-empty normalised text, or *default*.
    """
    for candidate_selector in selectors:
        try:
            target = root.locator(candidate_selector).first
            found = normalize_text(await safe_inner_text(target, ""))
        except Exception:
            # A selector that cannot be evaluated is skipped, not fatal.
            continue
        if found:
            return found
    return default
def _xpath_literal(value: str) -> str:
    """Return *value* as an XPath 1.0 string literal, whatever quotes it holds."""
    if '"' not in value:
        return f'"{value}"'
    if "'" not in value:
        return f"'{value}'"
    # XPath 1.0 has no escape sequence inside literals, so a value containing
    # both quote kinds is assembled with concat() from double-quoted pieces
    # joined by a single-quoted '"'.
    pieces = ", '\"', ".join(f'"{piece}"' for piece in value.split('"'))
    return f"concat({pieces})"


async def get_card_root(page: Page, href: str) -> Locator:
    """Locate the card element that contains the link pointing at *href*.

    The nearest ancestor ``div`` whose class contains ``wants-card__top`` is
    preferred, then the nearest one whose class contains ``wants-card``. If
    neither exists, the link's closest ancestor ``div`` is returned without
    checking that it matches anything.

    Args:
        page: Page to search in.
        href: Exact ``href`` attribute value of the project link. It is
            embedded as a properly quoted XPath literal, so values containing
            quote characters cannot break the expression.

    Returns:
        A locator narrowed to the first matching element.
    """
    link_xpath = f"//a[@href={_xpath_literal(href)}]"
    for class_fragment in ("wants-card__top", "wants-card"):
        card = page.locator(
            f'xpath={link_xpath}/ancestor::div[contains(@class, "{class_fragment}")][1]'
        )
        if await card.count() > 0:
            return card.first
    return page.locator(f"xpath={link_xpath}/ancestor::div[1]").first
async def extract_price(card: Locator) -> str:
    """Build a human-readable price string for a project card.

    The card's full text is first searched for the labelled budget phrases
    ("desired budget: up to ... ₽", "price up to: ... ₽",
    "acceptable: up to ... ₽"). If at least one is present, the distinct
    matches are joined with ``" | "``.

    Otherwise the price is read from dedicated price elements: a primary
    price and an optional "higher price" note, again joined with ``" | "``.

    Args:
        card: Locator of the project card.

    Returns:
        The price description, or ``"По договоренности"`` ("by agreement")
        when nothing could be found.
    """
    card_text = normalize_text(await safe_inner_text(card, ""))

    patterns = [
        r"(Желаемый бюджет:\s*до\s*[\d\s]+₽)",
        r"(Цена до:\s*[\d\s]+₽)",
        r"(Допустимый:\s*до\s*[\d\s]+₽)",
    ]
    found: list[str] = []
    for pattern in patterns:
        match = re.search(pattern, card_text, flags=re.IGNORECASE)
        if match is None:
            continue
        value = normalize_text(match.group(1))
        if value and value not in found:
            found.append(value)
    if found:
        return " | ".join(found)

    primary = await first_text(
        card,
        [
            ".wants-card__price",
            ".wants-card__header-right-block .wants-card__price",
            "[class*='wants-card__price']",
            "[class*='price']",
        ],
        "",
    )
    higher = await first_text(
        card,
        [
            ".wants-card__description-higher-price",
            "[class*='description-higher-price']",
        ],
        "",
    )
    # The catch-all "[class*='price']" selector can resolve to the very same
    # "...higher-price" element that the second lookup reads, so drop empty
    # and repeated values (order preserved) instead of emitting "X | X".
    parts = list(dict.fromkeys(part for part in (primary, higher) if part))
    if parts:
        return " | ".join(parts)
    return "По договоренности"
async def extract_description(card: Locator) -> str:
    """Return the description text of a project card as a single clean line.

    The text is taken from the first description selector that yields
    content. The expand/collapse toggle labels ("show in full" / "hide") are
    removed wherever they occur, and whitespace is normalised.

    Args:
        card: Locator of the project card.

    Returns:
        The cleaned description, or an empty string when none was found.
    """
    selectors = [
        ".wants-card__description-text .overflow-hidden .d-inline",
        ".wants-card__description-text .overflow-hidden",
        ".wants-card__description-text",
        "[class*='description-text']",
    ]
    text = await first_text(card, selectors, "")

    # Strip the UI toggle captions that are rendered inside the description.
    for ui_label in ("Показать полностью", "Скрыть"):
        text = text.replace(ui_label, "")
    text = text.replace("\xa0", " ")

    # NOTE(review): after normalize_text() the string holds no line breaks,
    # so first_line() appears to act only as a final trim here - confirm
    # whether truncating to the first line was intended.
    return first_line(normalize_text(text))
async def scrape_items(
    *,
    url: str,
    item_selector: str,
    extractor: Extractor,
    wait_until: str = "networkidle",
    render_delay: float = 3.0,
) -> list[Project]:
    """Open *url* in headless Chromium and extract one record per matched element.

    Args:
        url: Page to load.
        item_selector: Selector matching the elements handed to *extractor*.
        extractor: Async callback receiving ``(page, element_locator)``; a
            ``None`` result or a raised exception skips that element.
        wait_until: Load state passed to ``page.goto``.
        render_delay: Extra seconds to sleep after navigation before the
            page is queried.

    Returns:
        The extracted records in page order. If anything outside the
        per-element extraction fails, the error is printed with the
        ``"Ошибка:"`` prefix and an empty list is returned.
    """
    async with Stealth().use_async(async_playwright()) as playwright:
        browser = await playwright.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent=USER_AGENT,
                viewport=VIEWPORT,
            )
            page = await context.new_page()
            await page.goto(url, wait_until=wait_until)
            await asyncio.sleep(render_delay)

            collected: list[Project] = []
            for element in await page.locator(item_selector).all():
                try:
                    record = await extractor(page, element)
                except Exception:
                    # One broken card must not abort the whole scrape.
                    continue
                if record is not None:
                    collected.append(record)
            return collected
        except Exception as exc:
            print(f"Ошибка: {exc}")
            return []
        finally:
            # Runs on success and failure alike so the browser never lingers.
            await browser.close()
async def extract_kwork_project(page: Page, title_block: Locator) -> Optional[Project]:
    """Turn one project title element into a project record.

    Args:
        page: Page the element belongs to; used to find the enclosing card.
        title_block: Locator of the title element containing the project link.

    Returns:
        A mapping with ``title``, ``price``, ``url`` and ``description``, or
        ``None`` when the element has no link or the link has no ``href``.
    """
    anchor = title_block.locator("a").first
    if not await anchor.count():
        return None

    title = normalize_text(await safe_inner_text(anchor, ""))
    href = await anchor.get_attribute("href")
    if not href:
        return None

    # Price and description live on the card around the link, not in the
    # title element itself.
    card = await get_card_root(page, href)
    price = await extract_price(card)
    description = await extract_description(card)

    project: Project = {
        "title": title,
        "price": price,
        "url": normalize_url(href),
        "description": description,
    }
    return project
async def get_kwork_projects() -> list[Project]:
    """Scrape the project listing at ``PROJECTS_URL`` and return its records.

    Prints a progress message before starting.
    """
    print("Загружаем проекты...")
    projects = await scrape_items(
        extractor=extract_kwork_project,
        item_selector=".wants-card__header-title",
        url=PROJECTS_URL,
    )
    return projects
if __name__ == "__main__":
    # Script entry point: run the scraper, then print the records as
    # indented JSON with non-ASCII characters kept readable.
    data = asyncio.run(get_kwork_projects())
    rendered = json.dumps(data, indent=4, ensure_ascii=False)
    print(rendered)