commit bbd0e487c58656b274c48e677b2b2d2b9988ab6d
Author: Faynot
Date:   Fri Mar 27 20:59:30 2026 +0300

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4c49bd7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6c1117a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,22 @@
+# QWORK
+
+---
+
+Бот, который парсит вакансии с **kwork.ru** и отправляет в Telegram самые интересные предложения.
+
+## 🚀 Текущий статус
+
+На данный момент реализован только модуль **веб-скрапинга**:
+
+- сбор вакансий с Kwork
+- базовая обработка данных
+
+## ✅ TODO
+
+| Статус | Задача                                      |
+| ------ | ------------------------------------------- |
+| ⏳     | Реализовать отправку уведомлений в Telegram |
+| ⏳     | Добавить фильтрацию по ключевым словам      |
+| ⏳     | Интегрировать нейросеть для оценки вакансий |
+| ⏳     | Сделать возможность отклика на вакансии     |
+| ⏳     | Реализовать авторизацию в Kwork             |
diff --git a/bot.py b/bot.py
new file mode 100644
index 0000000..d6db686
--- /dev/null
+++ b/bot.py
@@ -0,0 +1,69 @@
+import asyncio
+import logging
+import sys
+from os import getenv
+
+from aiogram import Bot, Dispatcher, html
+from aiogram.client.default import DefaultBotProperties
+from aiogram.enums import ParseMode
+from aiogram.filters import CommandStart
+from aiogram.types import Message
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Bot token can be obtained via https://t.me/BotFather
+TOKEN = getenv("BOT_TOKEN")
+
+# All handlers should be attached to the Router (or Dispatcher)
+
+dp = Dispatcher()
+
+
+@dp.message(CommandStart())
+async def command_start_handler(message: Message) -> None:
+    """
+    This handler receives messages with `/start` command
+    """
+    # Most event objects have aliases for API methods that can be called in events' context
+    # For example if you want to answer to incoming message you can use `message.answer(...)` alias
+    # and the target chat will be passed to :ref:`aiogram.methods.send_message.SendMessage`
+    # method automatically or call API method directly via
+    # Bot instance: `bot.send_message(chat_id=message.chat.id, ...)`
+
+    # `from_user` is optional in the Bot API, and the display name is user-controlled text:
+    # quote it so HTML special characters in a name cannot break HTML parse mode
+    name = message.from_user.full_name if message.from_user else "there"
+    await message.answer(f"Hello, {html.bold(html.quote(name))}!")
+
+
+@dp.message()
+async def echo_handler(message: Message) -> None:
+    """
+    Handler will send a copy of the received message back to the sender
+
+    By default, message handler will handle all message types (like a text, photo, sticker etc.)
+    """
+    try:
+        # Send a copy of the received message
+        await message.send_copy(chat_id=message.chat.id)
+    except TypeError:
+        # Not all message types can be copied, so reply with a placeholder instead
+        await message.answer("Nice try!")
+
+
+async def main() -> None:
+    # Fail early with an actionable message when the token is missing
+    if not TOKEN:
+        raise RuntimeError("BOT_TOKEN is not set: export it or add it to .env")
+
+    # Initialize Bot instance with default bot properties which will be passed to all API calls
+    bot = Bot(token=TOKEN, default=DefaultBotProperties(parse_mode=ParseMode.HTML))
+
+    # And the run events dispatching
+    await dp.start_polling(bot)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+    asyncio.run(main())
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..54ea45a
--- /dev/null
+++ b/main.py
@@ -0,0 +1,287 @@
+import asyncio
+import json
+import re
+from typing import Awaitable, Callable, Optional
+
+from playwright.async_api import Locator, Page, async_playwright
+from playwright_stealth import Stealth
+
+BASE_URL = "https://kwork.ru"
+PROJECTS_URL = f"{BASE_URL}/projects?c=11"
+
+USER_AGENT = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
+)
+
+VIEWPORT = {"width": 1920, "height": 1080}
+
+# Budget phrases searched for in a card's text, in reporting order.
+# Compiled once here instead of on every extract_price() call.
+PRICE_PATTERNS = [
+    re.compile(pattern, flags=re.IGNORECASE)
+    for pattern in (
+        r"(Желаемый бюджет:\s*до\s*[\d\s]+₽)",
+        r"(Цена до:\s*[\d\s]+₽)",
+        r"(Допустимый:\s*до\s*[\d\s]+₽)",
+    )
+]
+
+Project = dict[str, str]
+Extractor = Callable[[Page, Locator], Awaitable[Optional[Project]]]
+
+
+def normalize_text(text: str) -> str:
+    """Collapse every run of whitespace in *text* to one space and trim it."""
+    return re.sub(r"\s+", " ", text).strip()
+
+
+def first_line(text: str) -> str:
+    """Return the first line of *text*, stripped, or "" for blank input."""
+    text = text.strip()
+    if not text:
+        return ""
+    return text.splitlines()[0].strip()
+
+
+def normalize_url(href: str) -> str:
+    """Prefix hrefs that start with "/" with BASE_URL; return others as is."""
+    return f"{BASE_URL}{href}" if href.startswith("/") else href
+
+
+def _xpath_literal(value: str) -> str:
+    """Quote *value* as an XPath 1.0 string literal."""
+    if '"' not in value:
+        return f'"{value}"'
+    if "'" not in value:
+        return f"'{value}'"
+    # XPath 1.0 has no escape syntax, so a value holding both quote kinds is
+    # split on double quotes and reassembled with concat().
+    pieces = ", '\"', ".join(f'"{piece}"' for piece in value.split('"'))
+    return f"concat({pieces})"
+
+
+async def safe_inner_text(locator: Locator, default: str = "") -> str:
+    """Return the locator's inner text, or *default* if reading it fails.
+
+    Non-breaking spaces become regular spaces and the result is stripped.
+    Failures (including the 1.5 s timeout) are swallowed on purpose so that
+    one missing field does not abort the whole card.
+    """
+    try:
+        text = await locator.inner_text(timeout=1500)
+        text = text.replace("\xa0", " ")
+        return text.strip()
+    except Exception:
+        return default
+
+
+async def first_text(root: Locator, selectors: list[str], default: str = "") -> str:
+    """Return normalized text of the first selector under *root* that has any.
+
+    Selectors are tried in order; *default* is returned when none of them
+    yields non-empty text.
+    """
+    for selector in selectors:
+        try:
+            loc = root.locator(selector).first
+            text = await safe_inner_text(loc, "")
+            text = normalize_text(text)
+            if text:
+                return text
+        except Exception:
+            continue
+    return default
+
+
+async def get_card_root(page: Page, href: str) -> Locator:
+    """Locate the card element containing the link with the given *href*.
+
+    Tries the closest ancestor ``div`` whose class contains
+    ``wants-card__top``, then one whose class contains ``wants-card``, and
+    finally falls back to the link's nearest ancestor ``div``.
+    """
+    # href is scraped from the page, so quote it as an XPath literal rather
+    # than splicing it between hard-coded double quotes.
+    link = f"//a[@href={_xpath_literal(href)}]"
+
+    for class_part in ("wants-card__top", "wants-card"):
+        card = page.locator(
+            f'xpath={link}/ancestor::div[contains(@class, "{class_part}")][1]'
+        )
+        if await card.count() > 0:
+            return card.first
+
+    return page.locator(f"xpath={link}/ancestor::div[1]").first
+
+
+async def extract_price(card: Locator) -> str:
+    """Build a human-readable price string for a project card.
+
+    Budget phrases matching PRICE_PATTERNS in the card text take priority
+    and are joined with " | ". Otherwise the price and "higher price"
+    elements are read via CSS selectors. If nothing is found,
+    "По договоренности" is returned.
+    """
+    card_text = await safe_inner_text(card, "")
+    card_text = normalize_text(card_text)
+
+    found: list[str] = []
+    for pattern in PRICE_PATTERNS:
+        match = pattern.search(card_text)
+        if match:
+            value = normalize_text(match.group(1))
+            if value and value not in found:
+                found.append(value)
+
+    if found:
+        return " | ".join(found)
+
+    primary = await first_text(
+        card,
+        [
+            ".wants-card__price",
+            ".wants-card__header-right-block .wants-card__price",
+            "[class*='wants-card__price']",
+            "[class*='price']",
+        ],
+        "",
+    )
+
+    higher = await first_text(
+        card,
+        [
+            ".wants-card__description-higher-price",
+            "[class*='description-higher-price']",
+        ],
+        "",
+    )
+
+    parts = [part for part in [primary, higher] if part]
+    if parts:
+        return " | ".join(parts)
+
+    return "По договоренности"
+
+
+async def extract_description(card: Locator) -> str:
+    """Return the card's description without the expand/collapse labels."""
+    description = await first_text(
+        card,
+        [
+            ".wants-card__description-text .overflow-hidden .d-inline",
+            ".wants-card__description-text .overflow-hidden",
+            ".wants-card__description-text",
+            "[class*='description-text']",
+        ],
+        "",
+    )
+
+    description = description.replace("Показать полностью", "")
+    description = description.replace("Скрыть", "")
+    description = description.replace("\xa0", " ")
+    description = normalize_text(description)
+
+    # NOTE(review): first_text() has already collapsed newlines, so
+    # first_line() cannot shorten anything here - confirm whether only the
+    # first paragraph of the description was intended.
+    return first_line(description)
+
+
+async def scrape_items(
+    *,
+    url: str,
+    item_selector: str,
+    extractor: Extractor,
+    wait_until: str = "networkidle",
+    render_delay: float = 3.0,
+) -> list[Project]:
+    """Open *url* in headless Chromium and run *extractor* on every match.
+
+    Args:
+        url: Page to load.
+        item_selector: Selector of the elements handed to *extractor*.
+        extractor: Coroutine turning one element into a Project, or
+            returning None to skip it.
+        wait_until: Load state passed to ``page.goto``.
+        render_delay: Seconds to sleep after navigation before the
+            elements are collected.
+
+    Returns:
+        The extracted projects, or an empty list if loading the page or
+        collecting its elements failed.
+    """
+    async with Stealth().use_async(async_playwright()) as p:
+        browser = await p.chromium.launch(headless=True)
+        try:
+            context = await browser.new_context(
+                user_agent=USER_AGENT,
+                viewport=VIEWPORT,
+            )
+            page = await context.new_page()
+
+            await page.goto(url, wait_until=wait_until)
+            await asyncio.sleep(render_delay)
+
+            result: list[Project] = []
+            items = await page.locator(item_selector).all()
+
+            for item in items:
+                try:
+                    data = await extractor(page, item)
+                    if data is not None:
+                        result.append(data)
+                except Exception as e:
+                    # One broken card must not abort the run, but say so
+                    # instead of dropping it silently.
+                    print(f"Пропущен элемент: {e}")
+
+            return result
+        except Exception as e:
+            print(f"Ошибка: {e}")
+            return []
+        finally:
+            await browser.close()
+
+
+async def extract_kwork_project(page: Page, title_block: Locator) -> Optional[Project]:
+    """Turn one title block into a Project, or None if it has no usable link.
+
+    The result has the keys "title", "price", "url" and "description".
+    """
+    link = title_block.locator("a").first
+    if await link.count() == 0:
+        return None
+
+    title_text = normalize_text(await safe_inner_text(link, ""))
+    href = await link.get_attribute("href")
+    if not href:
+        return None
+
+    card = await get_card_root(page, href)
+
+    price = await extract_price(card)
+    description = await extract_description(card)
+
+    return {
+        "title": title_text,
+        "price": price,
+        "url": normalize_url(href),
+        "description": description,
+    }
+
+
+async def get_kwork_projects() -> list[Project]:
+    """Scrape the project list at PROJECTS_URL and return it."""
+    print("Загружаем проекты...")
+
+    return await scrape_items(
+        url=PROJECTS_URL,
+        item_selector=".wants-card__header-title",
+        extractor=extract_kwork_project,
+    )
+
+
+if __name__ == "__main__":
+    data = asyncio.run(get_kwork_projects())
+    print(json.dumps(data, ensure_ascii=False, indent=4))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..13b88c4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+aiogram==3.26.0
+playwright==1.58.0
+playwright_stealth==2.0.2
+python-dotenv==1.2.2