"""Scrape project listings from kwork.ru with Playwright.

Run as a script to print the scraped projects as JSON on stdout.
"""

import asyncio
import json
import logging
import re
from typing import Awaitable, Callable, Optional
from urllib.parse import urljoin

from playwright.async_api import Locator, Page, async_playwright
from playwright_stealth import Stealth

logger = logging.getLogger(__name__)

BASE_URL = "https://kwork.ru"
PROJECTS_URL = f"{BASE_URL}/projects?c=11"
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
VIEWPORT = {"width": 1920, "height": 1080}

# A scraped project: "title", "price", "url" and "description" strings.
Project = dict[str, str]
# Callback that turns one matched item locator into a Project (or None to skip).
Extractor = Callable[[Page, Locator], Awaitable[Optional[Project]]]

# Budget phrases looked for in the card's full text, compiled once.
_PRICE_PATTERNS = tuple(
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in (
        r"(Желаемый бюджет:\s*до\s*[\d\s]+₽)",
        r"(Цена до:\s*[\d\s]+₽)",
        r"(Допустимый:\s*до\s*[\d\s]+₽)",
    )
)


def normalize_text(text: str) -> str:
    """Collapse every run of whitespace to a single space and strip the ends."""
    return re.sub(r"\s+", " ", text).strip()


def first_line(text: str) -> str:
    """Return the first line of *text*, stripped; "" for blank input."""
    text = text.strip()
    if not text:
        return ""
    return text.splitlines()[0].strip()


def normalize_url(href: str) -> str:
    """Resolve *href* against ``BASE_URL``.

    Site-relative (``/x``), protocol-relative (``//host/x``) and plain
    relative hrefs are resolved; absolute URLs are returned unchanged.
    An empty href is returned as is.
    """
    if not href:
        return href
    return urljoin(f"{BASE_URL}/", href)


def _xpath_literal(value: str) -> str:
    """Quote *value* as an XPath 1.0 string literal.

    XPath 1.0 has no escape sequences, so a value containing both quote
    characters is assembled with ``concat()``.
    """
    if '"' not in value:
        return f'"{value}"'
    if "'" not in value:
        return f"'{value}'"
    pieces = (f'"{piece}"' for piece in value.split('"'))
    return "concat(" + ", '\"', ".join(pieces) + ")"


async def safe_inner_text(locator: Locator, default: str = "") -> str:
    """Return the locator's inner text, or *default* if it cannot be read.

    Non-breaking spaces are replaced with regular spaces and the result is
    stripped. Any exception (including the 1.5 s timeout) yields *default*.
    """
    try:
        text = await locator.inner_text(timeout=1500)
        text = text.replace("\xa0", " ")
        return text.strip()
    except Exception:
        # Deliberate best-effort: a missing element must not abort the scrape.
        return default


async def first_text(root: Locator, selectors: list[str], default: str = "") -> str:
    """Return the first non-empty normalized text found under *root*.

    Selectors are tried in order; for each, only the first matching element
    is read. Returns *default* when none of them yields text.
    """
    for selector in selectors:
        try:
            loc = root.locator(selector).first
            text = await safe_inner_text(loc, "")
            text = normalize_text(text)
            if text:
                return text
        except Exception:
            continue
    return default


async def get_card_root(page: Page, href: str) -> Locator:
    """Locate the card container that encloses the link with this *href*.

    Tries the nearest ancestor ``div`` whose class contains
    ``wants-card__top``, then ``wants-card``, and finally falls back to the
    nearest ancestor ``div`` of any class.
    """
    # Quote the href properly so a value containing '"' cannot break the XPath.
    link_xpath = f"xpath=//a[@href={_xpath_literal(href)}]"
    for class_fragment in ("wants-card__top", "wants-card"):
        card = page.locator(
            f'{link_xpath}/ancestor::div[contains(@class, "{class_fragment}")][1]'
        )
        if await card.count() > 0:
            return card.first
    return page.locator(f"{link_xpath}/ancestor::div[1]").first


async def extract_price(card: Locator) -> str:
    """Return a human-readable price string for a project card.

    Budget phrases found in the card text are preferred; otherwise the
    price elements are read by selector. Multiple distinct values are joined
    with " | ". Returns "По договоренности" when nothing is found.
    """
    card_text = normalize_text(await safe_inner_text(card, ""))

    found: list[str] = []
    for pattern in _PRICE_PATTERNS:
        match = pattern.search(card_text)
        if match:
            value = normalize_text(match.group(1))
            if value and value not in found:
                found.append(value)
    if found:
        return " | ".join(found)

    primary = await first_text(
        card,
        [
            ".wants-card__price",
            ".wants-card__header-right-block .wants-card__price",
            "[class*='wants-card__price']",
            "[class*='price']",
        ],
        "",
    )
    higher = await first_text(
        card,
        [
            ".wants-card__description-higher-price",
            "[class*='description-higher-price']",
        ],
        "",
    )
    # The broad "[class*='price']" fallback can resolve to the same element
    # as the "higher price" selectors, so drop duplicates (order preserved).
    parts = list(dict.fromkeys(part for part in (primary, higher) if part))
    if parts:
        return " | ".join(parts)
    return "По договоренности"


async def extract_description(card: Locator) -> str:
    """Return the card's description text with expand/collapse labels removed."""
    description = await first_text(
        card,
        [
            ".wants-card__description-text .overflow-hidden .d-inline",
            ".wants-card__description-text .overflow-hidden",
            ".wants-card__description-text",
            "[class*='description-text']",
        ],
        "",
    )
    description = description.replace("Показать полностью", "")
    description = description.replace("Скрыть", "")
    # Re-normalize: removing the labels can leave doubled or trailing spaces.
    description = normalize_text(description)
    # NOTE(review): the text is already whitespace-collapsed at this point, so
    # first_line() returns it unchanged - confirm whether truncating to the
    # first line of the raw description was the intent.
    return first_line(description)


async def scrape_items(
    *,
    url: str,
    item_selector: str,
    extractor: Extractor,
    wait_until: str = "networkidle",
    render_delay: float = 3.0,
) -> list[Project]:
    """Open *url* in headless Chromium and run *extractor* on every match.

    Args:
        url: Page to load.
        item_selector: Selector whose matches are passed to *extractor*.
        extractor: Coroutine returning a Project, or None to skip the item.
        wait_until: Load state passed to ``page.goto``.
        render_delay: Seconds to sleep after navigation before querying.

    Returns:
        The extracted projects. Items whose extractor raises are logged and
        skipped; any other failure after launch is printed and yields [].
    """
    async with Stealth().use_async(async_playwright()) as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent=USER_AGENT,
                viewport=VIEWPORT,
            )
            page = await context.new_page()
            await page.goto(url, wait_until=wait_until)
            await asyncio.sleep(render_delay)

            result: list[Project] = []
            items = await page.locator(item_selector).all()
            for item in items:
                try:
                    data = await extractor(page, item)
                except Exception as exc:
                    # One broken card must not abort the run, but leave a trace.
                    logger.warning("Skipping item that failed to extract: %s", exc)
                    continue
                if data is not None:
                    result.append(data)
            return result
        except Exception as e:
            print(f"Ошибка: {e}")
            return []
        finally:
            await browser.close()


async def extract_kwork_project(page: Page, title_block: Locator) -> Optional[Project]:
    """Build a Project from a card title block.

    Returns None when the block has no link or the link has no href.
    """
    link = title_block.locator("a").first
    if await link.count() == 0:
        return None

    title_text = normalize_text(await safe_inner_text(link, ""))
    href = await link.get_attribute("href")
    if not href:
        return None

    # The card is looked up with the raw href, exactly as it appears in the DOM.
    card = await get_card_root(page, href)
    price = await extract_price(card)
    description = await extract_description(card)
    return {
        "title": title_text,
        "price": price,
        "url": normalize_url(href),
        "description": description,
    }


async def get_kwork_projects() -> list[Project]:
    """Scrape the kwork.ru projects listing at ``PROJECTS_URL``."""
    print("Загружаем проекты...")
    return await scrape_items(
        url=PROJECTS_URL,
        item_selector=".wants-card__header-title",
        extractor=extract_kwork_project,
    )


if __name__ == "__main__":
    data = asyncio.run(get_kwork_projects())
    print(json.dumps(data, ensure_ascii=False, indent=4))