This commit is contained in:
Faynot
2026-03-27 20:59:30 +03:00
commit bbd0e487c5
5 changed files with 310 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
.env

24
README.md Normal file
View File

@@ -0,0 +1,24 @@
# QWORK
---
Бот, который парсит вакансии с **kwork.ru** и отправляет в Telegram самые интересные предложения.
## 🚀 Текущий статус
На данный момент реализован только модуль **веб-скрейпинга**:
- сбор вакансий с Kwork
- базовая обработка данных
## ✅ TODO
| Статус | Задача |
| ------ | ------------------------------------------- |
| ⏳ | Реализовать отправку уведомлений в Telegram |
| ⏳ | Добавить фильтрацию по ключевым словам |
| ⏳ | Интегрировать нейросеть для оценки вакансий |
| ⏳ | Сделать возможность отклика на вакансии |
| ⏳ | Реализовать авторизацию в Kwork |

61
bot.py Normal file
View File

@@ -0,0 +1,61 @@
import asyncio
import logging
import sys
from os import getenv
from aiogram import Bot, Dispatcher, html
from aiogram.client.default import DefaultBotProperties
from aiogram.enums import ParseMode
from aiogram.filters import CommandStart
from aiogram.types import Message
from dotenv import load_dotenv
# Load variables from a local `.env` file (if present) into the process environment.
load_dotenv()
# Bot token can be obtained via https://t.me/BotFather
# Read from the BOT_TOKEN environment variable; `getenv` returns None when it is unset.
TOKEN = getenv("BOT_TOKEN")
# All handlers should be attached to the Router (or Dispatcher)
dp = Dispatcher()
@dp.message(CommandStart())
async def command_start_handler(message: Message) -> None:
    """Greet the sender in response to the `/start` command.

    The reply is sent through `message.answer(...)`, an alias that targets
    the chat the incoming message came from.

    Args:
        message: The incoming `/start` message.
    """
    sender = message.from_user
    if sender is None:
        # `from_user` is declared optional by the Bot API, so do not assume
        # a sender is always attached to the message.
        await message.answer("Hello!")
        return

    # `html.bold` only wraps the value in tags; the display name is
    # user-controlled, so escape it first. Otherwise a name containing
    # `<` or `&` produces markup Telegram refuses to parse.
    await message.answer(f"Hello, {html.bold(html.quote(sender.full_name))}!")
@dp.message()
async def echo_handler(message: Message) -> None:
    """Echo any received message back to the chat it came from.

    Registered without filters, so it handles every message type
    (text, photo, sticker, etc.) not claimed by an earlier handler.

    Args:
        message: The incoming message to copy back.
    """
    target_chat = message.chat.id
    try:
        # Send a copy of the received message.
        await message.send_copy(chat_id=target_chat)
    except TypeError:
        # Not every message type can be copied; reply with a fixed text instead.
        await message.answer("Nice try!")
async def main() -> None:
    """Create the bot and run long polling until it is stopped.

    Raises:
        RuntimeError: If the BOT_TOKEN environment variable is missing or empty.
    """
    if not TOKEN:
        # Fail fast with an actionable message instead of passing None to Bot().
        raise RuntimeError(
            "BOT_TOKEN is not set; define it in the environment or in the .env file"
        )

    # Default bot properties are applied to every API call; HTML parse mode
    # lets handlers send formatted text.
    bot = Bot(token=TOKEN, default=DefaultBotProperties(parse_mode=ParseMode.HTML))
    # Start dispatching incoming updates to the registered handlers.
    await dp.start_polling(bot)
if __name__ == "__main__":
    # Send INFO-level log records to stdout, then run the bot until interrupted.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    asyncio.run(main())

220
main.py Normal file
View File

@@ -0,0 +1,220 @@
import asyncio
import json
import re
from typing import Awaitable, Callable, Optional
from urllib.parse import urljoin

from playwright.async_api import Locator, Page, async_playwright
from playwright_stealth import Stealth
# Root of the site being scraped; normalize_url() uses it to build absolute URLs.
BASE_URL = "https://kwork.ru"
# Listing page that is scraped.
# NOTE(review): `c=11` is presumably a category filter — confirm against the site.
PROJECTS_URL = f"{BASE_URL}/projects?c=11"
# User-agent string passed to the browser context (desktop Chrome on Linux).
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
# Viewport size passed to the browser context.
VIEWPORT = {"width": 1920, "height": 1080}
# One scraped project: string field names mapped to string values.
Project = dict[str, str]
# Async callable that receives the page and one matched locator and returns
# a Project, or None when the item should be skipped.
Extractor = Callable[[Page, Locator], Awaitable[Optional[Project]]]
def normalize_text(text: str) -> str:
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def first_line(text: str) -> str:
    """Return the first line of *text* with surrounding whitespace removed.

    Leading and trailing whitespace is stripped before splitting, so blank
    lines at the start are ignored. Empty or whitespace-only input gives "".
    """
    lines = text.strip().splitlines()
    return lines[0].strip() if lines else ""
def normalize_url(href: str) -> str:
    """Return *href* as an absolute URL, resolved against BASE_URL.

    Root-relative paths (`/projects/1`) are prefixed with the site root and
    absolute URLs are returned unchanged. Relative paths without a leading
    slash and protocol-relative references (`//host/path`) are resolved too,
    rather than being returned unresolved or glued onto the site root.

    Args:
        href: The raw `href` attribute value.

    Returns:
        The absolute URL, or *href* unchanged when it is empty.
    """
    if not href:
        # An empty href stays empty rather than resolving to the site root.
        return href
    return urljoin(BASE_URL, href)
async def safe_inner_text(locator: Locator, default: str = "") -> str:
    """Read a locator's inner text without ever raising.

    Args:
        locator: The element to read.
        default: Value returned when the text cannot be read.

    Returns:
        The inner text with non-breaking spaces turned into regular spaces
        and the ends stripped, or *default* on any failure.
    """
    try:
        # Short timeout so a missing element does not stall the whole scrape.
        raw = await locator.inner_text(timeout=1500)
        return raw.replace("\xa0", " ").strip()
    except Exception:
        # Best-effort read: any failure falls back to the default.
        return default
async def first_text(root: Locator, selectors: list[str], default: str = "") -> str:
    """Return the first non-empty text found under *root* for the given selectors.

    Selectors are tried in order; for each one, the first matching element's
    text is read and whitespace-normalized.

    Args:
        root: Locator to search within.
        selectors: Candidate selectors, most preferred first.
        default: Value returned when no selector yields text.

    Returns:
        The first non-empty normalized text, or *default*.
    """
    for candidate_selector in selectors:
        try:
            raw = await safe_inner_text(root.locator(candidate_selector).first, "")
            cleaned = normalize_text(raw)
        except Exception:
            # A failing selector just means "try the next one".
            continue
        if cleaned:
            return cleaned
    return default
async def get_card_root(page: Page, href: str) -> Locator:
    """Locate the card container around the link whose `href` equals *href*.

    Tries, in order, the nearest ancestor `div` whose class attribute
    contains `wants-card__top`, then one containing `wants-card`, and
    finally falls back to the link's closest ancestor `div`.

    Args:
        page: The page to search.
        href: Exact `href` attribute value of the project link.

    Returns:
        A locator for the first matching container.
    """
    # NOTE(review): href is interpolated into the XPath unescaped; a value
    # containing a double quote would produce an invalid expression.
    link_xpath = f'xpath=//a[@href="{href}"]'

    # NOTE(review): XPath contains() is a substring test, so "wants-card" also
    # matches element classes such as "wants-card__header-title" — confirm
    # that the second lookup selects the intended container.
    for class_fragment in ("wants-card__top", "wants-card"):
        matches = page.locator(
            f'{link_xpath}/ancestor::div[contains(@class, "{class_fragment}")][1]'
        )
        if await matches.count() > 0:
            return matches.first

    return page.locator(f"{link_xpath}/ancestor::div[1]").first
async def extract_price(card: Locator) -> str:
    """Build a human-readable price string for a project card.

    First looks for labelled budget phrases in the card's full text. If none
    are present, falls back to dedicated price elements. Multiple values are
    joined with " | ".

    Args:
        card: Locator of the project card.

    Returns:
        The price text, or "По договоренности" when nothing is found.
    """
    flattened = normalize_text(await safe_inner_text(card, ""))

    # Labelled budget phrases, matched case-insensitively against the card text.
    budget_patterns = (
        r"(Желаемый бюджет:\s*до\s*[\d\s]+₽)",
        r"(Цена до:\s*[\d\s]+₽)",
        r"(Допустимый:\s*до\s*[\d\s]+₽)",
    )
    labelled: list[str] = []
    for budget_pattern in budget_patterns:
        hit = re.search(budget_pattern, flattened, flags=re.IGNORECASE)
        if hit is None:
            continue
        phrase = normalize_text(hit.group(1))
        if phrase and phrase not in labelled:
            labelled.append(phrase)
    if labelled:
        return " | ".join(labelled)

    # No labelled phrase: read the price elements directly.
    main_price = await first_text(
        card,
        [
            ".wants-card__price",
            ".wants-card__header-right-block .wants-card__price",
            "[class*='wants-card__price']",
            "[class*='price']",
        ],
        "",
    )
    upper_price = await first_text(
        card,
        [
            ".wants-card__description-higher-price",
            "[class*='description-higher-price']",
        ],
        "",
    )
    from_elements = [value for value in (main_price, upper_price) if value]
    return " | ".join(from_elements) if from_elements else "По договоренности"
async def extract_description(card: Locator) -> str:
    """Extract a cleaned-up description from a project card.

    Args:
        card: Locator of the project card.

    Returns:
        The description with the "Показать полностью" / "Скрыть" captions
        removed and whitespace normalized, or "" when none is found.
    """
    # Most specific selector first, loosest last.
    description_selectors = [
        ".wants-card__description-text .overflow-hidden .d-inline",
        ".wants-card__description-text .overflow-hidden",
        ".wants-card__description-text",
        "[class*='description-text']",
    ]
    body = await first_text(card, description_selectors, "")

    # Remove the captions (presumably expand/collapse controls) from the text.
    for caption in ("Показать полностью", "Скрыть"):
        body = body.replace(caption, "")

    body = normalize_text(body.replace("\xa0", " "))
    # NOTE(review): the text is already whitespace-collapsed (newlines
    # included) at this point, so first_line() appears to only strip here —
    # confirm whether truncating to the first line was intended.
    return first_line(body)
async def scrape_items(
    *,
    url: str,
    item_selector: str,
    extractor: Extractor,
    wait_until: str = "networkidle",
    render_delay: float = 3.0,
) -> list[Project]:
    """Open *url* in headless Chromium and extract one record per matched item.

    Args:
        url: Page to load.
        item_selector: Selector matching one element per item.
        extractor: Async callable invoked with the page and each matched
            element; returns a Project, or None to skip the item.
        wait_until: Load state passed to `page.goto`.
        render_delay: Extra seconds to wait after navigation before the
            elements are queried.

    Returns:
        The extracted projects in page order. An empty list is returned when
        the page cannot be loaded or queried; the error is printed, not raised.
    """
    async with Stealth().use_async(async_playwright()) as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent=USER_AGENT,
                viewport=VIEWPORT,
            )
            page = await context.new_page()
            await page.goto(url, wait_until=wait_until)
            # Give client-side rendering time to finish before querying.
            await asyncio.sleep(render_delay)

            result: list[Project] = []
            items = await page.locator(item_selector).all()
            for index, item in enumerate(items):
                try:
                    data = await extractor(page, item)
                except Exception as e:
                    # One broken item must not abort the scrape, but report it
                    # so selector or markup drift does not go unnoticed.
                    print(f"Пропущен элемент {index}: {e}")
                    continue
                if data is not None:
                    result.append(data)
            return result
        except Exception as e:
            print(f"Ошибка: {e}")
            return []
        finally:
            # Always release the browser, on success and on failure.
            await browser.close()
async def extract_kwork_project(page: Page, title_block: Locator) -> Optional[Project]:
    """Turn one project title block into a Project record.

    Args:
        page: The page the title block belongs to.
        title_block: Locator of the element containing the project link.

    Returns:
        A dict with "title", "price", "url" and "description", or None when
        the block has no link or the link has no href.
    """
    anchor = title_block.locator("a").first
    if not await anchor.count():
        return None

    title = normalize_text(await safe_inner_text(anchor, ""))
    target = await anchor.get_attribute("href")
    if not target:
        return None

    # Price and description live in the surrounding card, not the title block.
    container = await get_card_root(page, target)
    project: Project = {
        "title": title,
        "price": await extract_price(container),
        "url": normalize_url(target),
        "description": await extract_description(container),
    }
    return project
async def get_kwork_projects() -> list[Project]:
    """Scrape the projects listing at PROJECTS_URL and return the projects found."""
    print("Загружаем проекты...")
    projects = await scrape_items(
        extractor=extract_kwork_project,
        item_selector=".wants-card__header-title",
        url=PROJECTS_URL,
    )
    return projects
if __name__ == "__main__":
    # Scrape once and print the projects as indented JSON with non-ASCII
    # characters left unescaped.
    projects = asyncio.run(get_kwork_projects())
    print(json.dumps(projects, indent=4, ensure_ascii=False))

4
requirements.txt Normal file
View File

@@ -0,0 +1,4 @@
aiogram==3.26.0
playwright==1.58.0
playwright_stealth==2.0.2
python-dotenv==1.2.2