init
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
.env
|
||||||
24
README.md
Normal file
24
README.md
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
Вот более аккуратная и читаемая версия `README.md` с таблицей TODO:
|
||||||
|
|
||||||
|
# QWORK
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Бот, который парсит вакансии с **kwork.ru** и отправляет в Telegram самые интересные предложения.
|
||||||
|
|
||||||
|
## 🚀 Текущий статус
|
||||||
|
|
||||||
|
На данный момент реализован только модуль **веб-скрапинга**:
|
||||||
|
|
||||||
|
- сбор вакансий с Kwork
|
||||||
|
- базовая обработка данных
|
||||||
|
|
||||||
|
## ✅ TODO
|
||||||
|
|
||||||
|
| Статус | Задача |
|
||||||
|
| ------ | ------------------------------------------- |
|
||||||
|
| ⏳ | Реализовать отправку уведомлений в Telegram |
|
||||||
|
| ⏳ | Добавить фильтрацию по ключевым словам |
|
||||||
|
| ⏳ | Интегрировать нейросеть для оценки вакансий |
|
||||||
|
| ⏳ | Сделать возможность отклика на вакансии |
|
||||||
|
| ⏳ | Реализовать авторизацию в Kwork |
|
||||||
61
bot.py
Normal file
61
bot.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from os import getenv
|
||||||
|
|
||||||
|
from aiogram import Bot, Dispatcher, html
|
||||||
|
from aiogram.client.default import DefaultBotProperties
|
||||||
|
from aiogram.enums import ParseMode
|
||||||
|
from aiogram.filters import CommandStart
|
||||||
|
from aiogram.types import Message
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Load variables from a local .env file (if any) before the token is read.
load_dotenv()

# Telegram bot token, issued by https://t.me/BotFather.
# `getenv` returns None when BOT_TOKEN is not defined.
TOKEN = getenv("BOT_TOKEN")

# The handlers defined below register themselves on this dispatcher
# through its decorators.
dp = Dispatcher()
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message(CommandStart())
async def command_start_handler(message: Message) -> None:
    """Greet the sender of a `/start` command by name.

    The reply goes to the chat the command came from, via the
    `message.answer(...)` shortcut. The greeting is sent as HTML, so the
    display name is escaped before it is wrapped in a bold tag; otherwise a
    name containing `<`, `>` or `&` would produce markup Telegram rejects.

    Args:
        message: The incoming message that carried the `/start` command.
    """
    # `from_user` is optional in the Bot API (e.g. messages posted on behalf
    # of a channel), so fall back to a neutral greeting instead of crashing.
    sender = message.from_user
    display_name = html.quote(sender.full_name) if sender is not None else "there"
    await message.answer(f"Hello, {html.bold(display_name)}!")
|
||||||
|
|
||||||
|
|
||||||
|
@dp.message()
async def echo_handler(message: Message) -> None:
    """Echo any incoming message back to the chat it came from.

    The handler is registered without filters, so it receives every message
    type (text, photo, sticker, ...) that no earlier handler claimed.

    Args:
        message: The incoming message to copy back.
    """
    origin_chat = message.chat.id
    try:
        # Re-send an identical copy of what was received.
        await message.send_copy(chat_id=origin_chat)
    except TypeError:
        # Some message types cannot be copied; reply with a fixed text instead.
        await message.answer("Nice try!")
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Create the bot and dispatch incoming updates via long polling.

    Raises:
        RuntimeError: If the BOT_TOKEN environment variable is missing or
            empty, so the misconfiguration is reported clearly instead of
            surfacing as an opaque error from the Bot constructor.
    """
    if not TOKEN:
        raise RuntimeError(
            "BOT_TOKEN is not set; define it in the environment or in the .env file"
        )

    # Default properties are applied to every API call made through this bot;
    # here they make HTML the parse mode for outgoing messages.
    bot = Bot(token=TOKEN, default=DefaultBotProperties(parse_mode=ParseMode.HTML))

    # Run the dispatcher until polling is stopped.
    await dp.start_polling(bot)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Send INFO-level (and higher) log records to stdout, then run the bot
    # until the event loop finishes.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    asyncio.run(main())
|
||||||
220
main.py
Normal file
220
main.py
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from typing import Awaitable, Callable, Optional
|
||||||
|
|
||||||
|
from playwright.async_api import Locator, Page, async_playwright
|
||||||
|
from playwright_stealth import Stealth
|
||||||
|
|
||||||
|
# Site root; also used to turn site-relative links into absolute ones.
BASE_URL = "https://kwork.ru"
# Listing page that gets scraped (the `c=11` query parameter is presumably a
# category filter - verify against the site).
PROJECTS_URL = BASE_URL + "/projects?c=11"

# Browser identity presented to the site.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )
)

# Size of the browser viewport, in pixels.
VIEWPORT = dict(width=1920, height=1080)

# A scraped project: string field names mapped to string values.
Project = dict[str, str]
# Async callback that turns one matched element into a Project, or None to skip it.
Extractor = Callable[[Page, Locator], Awaitable[Optional[Project]]]
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_text(text: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def first_line(text: str) -> str:
    """Return the first line of *text*, trimmed, ignoring leading blank lines.

    Returns an empty string when *text* is empty or whitespace only.
    """
    lines = text.strip().splitlines()
    return lines[0].strip() if lines else ""
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_url(href: str) -> str:
    """Turn a link taken from the page into an absolute URL.

    Args:
        href: The raw `href` attribute value.

    Returns:
        * `https:` + href for protocol-relative links (`//host/path`), which
          already name their own host and must not get BASE_URL prepended;
        * BASE_URL + href for site-relative links (`/path`);
        * href unchanged in every other case.
    """
    # Check the protocol-relative form first: it also starts with "/".
    if href.startswith("//"):
        return f"https:{href}"
    if href.startswith("/"):
        return f"{BASE_URL}{href}"
    return href
|
||||||
|
|
||||||
|
|
||||||
|
async def safe_inner_text(locator: Locator, default: str = "") -> str:
    """Read an element's inner text without ever raising.

    Non-breaking spaces are replaced with regular spaces and the result is
    trimmed. The lookup is given 1500 ms; on any failure *default* is
    returned instead.
    """
    try:
        return (await locator.inner_text(timeout=1500)).replace("\xa0", " ").strip()
    except Exception:
        # Best-effort read: any failure simply yields the fallback value.
        return default
|
||||||
|
|
||||||
|
|
||||||
|
async def first_text(root: Locator, selectors: list[str], default: str = "") -> str:
    """Return the first non-empty normalized text found under *root*.

    Selectors are tried in the given order; for each one only the first
    matching element is read. *default* is returned when none of them yields
    any text.
    """
    for css in selectors:
        try:
            candidate = normalize_text(
                await safe_inner_text(root.locator(css).first, "")
            )
        except Exception:
            # A failing selector is skipped so the next one gets its chance.
            continue
        if candidate:
            return candidate
    return default
|
||||||
|
|
||||||
|
|
||||||
|
def _xpath_literal(value: str) -> str:
    """Render *value* as an XPath 1.0 string literal, whatever quotes it contains."""
    if '"' not in value:
        return f'"{value}"'
    if "'" not in value:
        return f"'{value}'"
    # XPath 1.0 has no escape sequences: when both quote kinds are present,
    # stitch the pieces together with concat().
    pieces = ", '\"', ".join(f'"{piece}"' for piece in value.split('"'))
    return f"concat({pieces})"


async def get_card_root(page: Page, href: str) -> Locator:
    """Locate the card element that encloses the link pointing at *href*.

    The nearest `div` ancestor of the link is looked up by class, from the
    most specific candidate to the least:

    1. class containing "wants-card__top";
    2. class containing "wants-card";
    3. otherwise the link's closest `div` ancestor of any class.

    The href is embedded as a properly quoted XPath literal, so a value that
    contains quote characters cannot break the selector.

    Args:
        page: The page holding the project list.
        href: Exact `href` attribute value of the project link.

    Returns:
        A locator for the first matching ancestor. The final fallback is
        returned without checking that it matches anything.
    """
    anchor = f"//a[@href={_xpath_literal(href)}]"

    for css_class in ("wants-card__top", "wants-card"):
        card = page.locator(
            f'xpath={anchor}/ancestor::div[contains(@class, "{css_class}")][1]'
        )
        if await card.count() > 0:
            return card.first

    return page.locator(f"xpath={anchor}/ancestor::div[1]").first
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_price(card: Locator) -> str:
    """Build a human-readable budget string for one project card.

    Resolution order:

    1. Search the card's whole text for the labelled budget phrases
       ("Желаемый бюджет: до ... ₽", "Цена до: ... ₽",
       "Допустимый: до ... ₽"), case-insensitively.
    2. If none is present, read the price elements through CSS selectors
       (main price, then the "higher price" note).
    3. If that finds nothing either, return "По договоренности".

    Multiple values are joined with " | ". Duplicates are dropped in both
    paths: the broad `[class*='price']` selector can land on the same element
    as the "higher price" selectors, which would otherwise repeat the text.

    Args:
        card: Locator of the card to inspect.

    Returns:
        The budget description, never empty.
    """
    card_text = normalize_text(await safe_inner_text(card, ""))

    labelled_patterns = (
        r"(Желаемый бюджет:\s*до\s*[\d\s]+₽)",
        r"(Цена до:\s*[\d\s]+₽)",
        r"(Допустимый:\s*до\s*[\d\s]+₽)",
    )
    labelled = [
        normalize_text(match.group(1))
        for pattern in labelled_patterns
        if (match := re.search(pattern, card_text, flags=re.IGNORECASE))
    ]
    # dict.fromkeys() removes repeats while keeping first-seen order.
    found = list(dict.fromkeys(value for value in labelled if value))
    if found:
        return " | ".join(found)

    primary = await first_text(
        card,
        [
            ".wants-card__price",
            ".wants-card__header-right-block .wants-card__price",
            "[class*='wants-card__price']",
            "[class*='price']",
        ],
        "",
    )

    higher = await first_text(
        card,
        [
            ".wants-card__description-higher-price",
            "[class*='description-higher-price']",
        ],
        "",
    )

    parts = list(dict.fromkeys(part for part in (primary, higher) if part))
    if parts:
        return " | ".join(parts)

    return "По договоренности"
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_description(card: Locator) -> str:
    """Return the cleaned-up description text of a project card.

    The text comes from the first description selector that yields content,
    from the most specific to the most generic. The "Показать полностью" and
    "Скрыть" labels (presumably the expand/collapse toggle - verify) are
    removed, non-breaking spaces become regular spaces, and whitespace is
    collapsed before the first line is returned.
    """
    raw = await first_text(
        card,
        [
            ".wants-card__description-text .overflow-hidden .d-inline",
            ".wants-card__description-text .overflow-hidden",
            ".wants-card__description-text",
            "[class*='description-text']",
        ],
        "",
    )

    # Applied in this exact order: UI labels first, then space cleanup.
    substitutions = (
        ("Показать полностью", ""),
        ("Скрыть", ""),
        ("\xa0", " "),
    )
    for needle, replacement in substitutions:
        raw = raw.replace(needle, replacement)

    return first_line(normalize_text(raw))
|
||||||
|
|
||||||
|
|
||||||
|
async def scrape_items(
    *,
    url: str,
    item_selector: str,
    extractor: Extractor,
    wait_until: str = "networkidle",
    render_delay: float = 3.0,
) -> list[Project]:
    """Open *url* in headless Chromium and extract one record per matched element.

    Args:
        url: Page to load.
        item_selector: Selector matching the elements handed to *extractor*.
        extractor: Async callback receiving the page and one matched element;
            it returns a Project, or None to skip the element.
        wait_until: Load state passed to `page.goto`.
        render_delay: Extra seconds to sleep after navigation, before the
            elements are collected.

    Returns:
        The extracted projects in page order. Elements whose extractor raises
        are skipped. Any other failure is printed and an empty list is
        returned. The browser is closed in every case.
    """
    async with Stealth().use_async(async_playwright()) as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent=USER_AGENT,
                viewport=VIEWPORT,
            )
            page = await context.new_page()

            await page.goto(url, wait_until=wait_until)
            await asyncio.sleep(render_delay)

            collected: list[Project] = []
            for node in await page.locator(item_selector).all():
                try:
                    project = await extractor(page, node)
                except Exception:
                    # One broken element must not abort the whole run.
                    continue
                if project is not None:
                    collected.append(project)

            return collected
        except Exception as e:
            print(f"Ошибка: {e}")
            return []
        finally:
            await browser.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_kwork_project(page: Page, title_block: Locator) -> Optional[Project]:
    """Build a project record from one title block of the project list.

    Args:
        page: The page the title block belongs to; used to find the
            enclosing card.
        title_block: Locator of the element that contains the project link.

    Returns:
        A dict with the keys "title", "price", "url" and "description", or
        None when the block has no link or the link has no href.
    """
    anchor = title_block.locator("a").first
    if not await anchor.count():
        return None

    title = normalize_text(await safe_inner_text(anchor, ""))

    href = await anchor.get_attribute("href")
    if not href:
        return None

    # Price and description live on the surrounding card, not in the title block.
    card = await get_card_root(page, href)
    price = await extract_price(card)
    description = await extract_description(card)

    return dict(
        title=title,
        price=price,
        url=normalize_url(href),
        description=description,
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def get_kwork_projects() -> list[Project]:
    """Scrape the project listing at PROJECTS_URL and return the parsed projects.

    A progress message is printed before scraping starts. Each element
    matching `.wants-card__header-title` is passed to `extract_kwork_project`.
    """
    print("Загружаем проекты...")

    projects = await scrape_items(
        url=PROJECTS_URL,
        item_selector=".wants-card__header-title",
        extractor=extract_kwork_project,
    )
    return projects
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run one scrape and print the result as indented JSON, keeping
    # non-ASCII characters as they are.
    data = asyncio.run(get_kwork_projects())
    rendered = json.dumps(data, indent=4, ensure_ascii=False)
    print(rendered)
|
||||||
4
requirements.txt
Normal file
4
requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
aiogram==3.26.0
|
||||||
|
playwright==1.58.0
|
||||||
|
playwright_stealth==2.0.2
|
||||||
|
python-dotenv==1.2.2
|
||||||
Reference in New Issue
Block a user