init
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
.env
|
||||
24
README.md
Normal file
24
README.md
Normal file
@@ -0,0 +1,24 @@
|
||||
Вот более аккуратная и читаемая версия `README.md` с таблицей TODO:
|
||||
|
||||
# QWORK
|
||||
|
||||
---
|
||||
|
||||
Бот, который парсит вакансии с **kwork.ru** и отправляет в Telegram самые интересные предложения.
|
||||
|
||||
## 🚀 Текущий статус
|
||||
|
||||
На данный момент реализован только модуль **веб-скрапинга**:
|
||||
|
||||
- сбор вакансий с Kwork
|
||||
- базовая обработка данных
|
||||
|
||||
## ✅ TODO
|
||||
|
||||
| Статус | Задача |
|
||||
| ------ | ------------------------------------------- |
|
||||
| ⏳ | Реализовать отправку уведомлений в Telegram |
|
||||
| ⏳ | Добавить фильтрацию по ключевым словам |
|
||||
| ⏳ | Интегрировать нейросеть для оценки вакансий |
|
||||
| ⏳ | Сделать возможность отклика на вакансии |
|
||||
| ⏳ | Реализовать авторизацию в Kwork |
|
||||
61
bot.py
Normal file
61
bot.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
from os import getenv
|
||||
|
||||
from aiogram import Bot, Dispatcher, html
|
||||
from aiogram.client.default import DefaultBotProperties
|
||||
from aiogram.enums import ParseMode
|
||||
from aiogram.filters import CommandStart
|
||||
from aiogram.types import Message
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Read a local `.env` file (python-dotenv) so that BOT_TOKEN can be supplied
# there instead of in the real process environment.
load_dotenv()

# Bot token can be obtained via https://t.me/BotFather.
# `getenv` yields None when BOT_TOKEN is not defined.
TOKEN = getenv("BOT_TOKEN")

# All handlers should be attached to the Router (or Dispatcher).
# The handlers in this module register themselves on `dp` via decorators.
dp = Dispatcher()
|
||||
|
||||
|
||||
@dp.message(CommandStart())
async def command_start_handler(message: Message) -> None:
    """
    Greet the sender in response to the `/start` command.

    The greeting is sent with `message.answer(...)`, which targets the chat
    the command came from.

    The sender's display name is user-controlled text.  The bot sends
    messages in HTML parse mode and `html.bold` only wraps its argument in
    tags, so the name is escaped with `html.quote` first; otherwise a name
    containing `<` or `&` would make Telegram reject the message.
    """
    # `from_user` is optional on a Message, so fall back to a generic greeting
    # instead of failing with AttributeError on None.
    sender = message.from_user
    display_name = html.quote(sender.full_name) if sender is not None else "there"
    await message.answer(f"Hello, {html.bold(display_name)}!")
|
||||
|
||||
|
||||
@dp.message()
async def echo_handler(message: Message) -> None:
    """
    Echo any incoming message back to the chat it came from.

    The decorator carries no filters, so this handler sees every message
    type (text, photo, sticker, ...) that an earlier handler did not consume.
    """
    try:
        # Re-send the received message as a copy into the same chat.
        destination = message.chat.id
        await message.send_copy(chat_id=destination)
    except TypeError:
        # Some message types cannot be copied; reply with plain text instead.
        await message.answer("Nice try!")
|
||||
|
||||
|
||||
async def main() -> None:
    """
    Create the bot and run long polling until it is stopped.

    Raises:
        RuntimeError: if BOT_TOKEN was not provided, so a misconfigured
            deployment fails with an actionable message instead of an error
            raised from inside the Bot constructor.
    """
    if not TOKEN:
        raise RuntimeError(
            "BOT_TOKEN is not set; define it in the environment or in a .env file"
        )

    # Default bot properties are passed to all API calls; HTML parse mode lets
    # handlers send formatted text.
    bot = Bot(token=TOKEN, default=DefaultBotProperties(parse_mode=ParseMode.HTML))

    # Start dispatching incoming updates to the handlers registered on `dp`.
    await dp.start_polling(bot)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: send INFO-level logs to stdout, then run the bot's
    # `main` coroutine until it returns.
    logging.basicConfig(level=logging.INFO, stream=sys.stdout)
    asyncio.run(main())
|
||||
220
main.py
Normal file
220
main.py
Normal file
@@ -0,0 +1,220 @@
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from typing import Awaitable, Callable, Optional
|
||||
|
||||
from playwright.async_api import Locator, Page, async_playwright
|
||||
from playwright_stealth import Stealth
|
||||
|
||||
# Root of the site being scraped; also used to absolutize site-relative links.
BASE_URL = "https://kwork.ru"
# Listing page that gets scraped.
# NOTE(review): `c=11` looks like a category filter -- confirm which one.
PROJECTS_URL = f"{BASE_URL}/projects?c=11"

# User-Agent string handed to the browser context (desktop Chrome 122 on Linux).
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)

# Viewport size handed to the browser context.
VIEWPORT = {"width": 1920, "height": 1080}

# One scraped project as a str -> str mapping; `extract_kwork_project` fills
# the keys "title", "price", "url" and "description".
Project = dict[str, str]
# Async callback that turns one matched element (plus its page) into a
# Project, or returns None when the element should be skipped.
Extractor = Callable[[Page, Locator], Awaitable[Optional[Project]]]
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
|
||||
|
||||
|
||||
def first_line(text: str) -> str:
    """Return the first line of *text* with surrounding whitespace removed.

    Leading blank lines are skipped because the whole text is stripped first.
    Blank or empty input yields an empty string.
    """
    stripped = text.strip()
    if stripped:
        return stripped.splitlines()[0].strip()
    return ""
|
||||
|
||||
|
||||
def normalize_url(href: str) -> str:
    """Resolve *href* against the site root and return an absolute URL.

    Site-relative links ("/projects/1") and already-absolute links behave as
    before.  In addition, protocol-relative links ("//host/path") are resolved
    to their own host instead of being glued onto the site root, and relative
    links without a leading slash ("projects/1") are made absolute.

    An empty *href* is returned unchanged.
    """
    # Local import: the module's top-level imports do not include urllib.
    from urllib.parse import urljoin

    if not href:
        return href
    # The trailing slash makes the root the base directory for relative hrefs.
    return urljoin(f"{BASE_URL}/", href)
|
||||
|
||||
|
||||
async def safe_inner_text(locator: Locator, default: str = "") -> str:
    """Read the locator's inner text without ever raising.

    Waits at most 1500 ms for the element.  Non-breaking spaces become
    regular spaces and the result is stripped.  Any failure (timeout,
    detached element, ...) yields *default* instead of an exception.
    """
    try:
        raw = await locator.inner_text(timeout=1500)
        return raw.replace("\xa0", " ").strip()
    except Exception:
        # Best-effort read: a missing element is an expected outcome here.
        return default
|
||||
|
||||
|
||||
async def first_text(root: Locator, selectors: list[str], default: str = "") -> str:
    """Return the first non-empty, whitespace-normalized text found under *root*.

    Selectors are tried in the given order; for each one only the first
    matching element is read.  When no selector produces text, *default* is
    returned.
    """
    for css in selectors:
        try:
            target = root.locator(css).first
            candidate = normalize_text(await safe_inner_text(target, ""))
        except Exception:
            # A selector that cannot be evaluated is skipped, not fatal.
            continue
        if candidate:
            return candidate
    return default
|
||||
|
||||
|
||||
async def get_card_root(page: Page, href: str) -> Locator:
    """Locate the card container that encloses the link pointing at *href*.

    Candidates are tried from most to least specific: the nearest ancestor
    <div> whose class contains "wants-card__top", then the nearest one whose
    class contains "wants-card", and finally the link's nearest ancestor <div>
    of any class.

    NOTE(review): *href* is interpolated verbatim into a double-quoted XPath
    literal, so an href containing `"` would yield an invalid expression.
    """
    anchor = f'xpath=//a[@href="{href}"]'

    for css_class in ("wants-card__top", "wants-card"):
        candidates = page.locator(
            f'{anchor}/ancestor::div[contains(@class, "{css_class}")][1]'
        )
        if await candidates.count() > 0:
            return candidates.first

    # Last resort: whatever <div> directly encloses the link.
    return page.locator(f"{anchor}/ancestor::div[1]").first
|
||||
|
||||
|
||||
async def extract_price(card: Locator) -> str:
    """Build a human-readable price string for one project card.

    Strategy, in order:
      1. Search the card's flattened text for the labelled budget phrases and
         join the distinct matches with " | ".
      2. Otherwise read the price elements via CSS selectors (main price and
         the "higher price" hint) and join whichever are present.
      3. Otherwise return the "negotiable" placeholder.
    """
    flat_text = normalize_text(await safe_inner_text(card, ""))

    labelled_patterns = (
        r"(Желаемый бюджет:\s*до\s*[\d\s]+₽)",
        r"(Цена до:\s*[\d\s]+₽)",
        r"(Допустимый:\s*до\s*[\d\s]+₽)",
    )

    labelled: list[str] = []
    for regex in labelled_patterns:
        hit = re.search(regex, flat_text, flags=re.IGNORECASE)
        if hit is None:
            continue
        phrase = normalize_text(hit.group(1))
        # Keep each phrase once, in pattern order.
        if phrase and phrase not in labelled:
            labelled.append(phrase)

    if labelled:
        return " | ".join(labelled)

    # No labelled phrase in the text: fall back to the dedicated price nodes.
    main_price = await first_text(
        card,
        [
            ".wants-card__price",
            ".wants-card__header-right-block .wants-card__price",
            "[class*='wants-card__price']",
            "[class*='price']",
        ],
        "",
    )
    upper_price = await first_text(
        card,
        [
            ".wants-card__description-higher-price",
            "[class*='description-higher-price']",
        ],
        "",
    )

    shown = [piece for piece in (main_price, upper_price) if piece]
    return " | ".join(shown) if shown else "По договоренности"
|
||||
|
||||
|
||||
async def extract_description(card: Locator) -> str:
    """Return the card's description text, cleaned of UI labels.

    The text is taken from the first description selector that yields
    content.  The expand/collapse button captions are removed and whitespace
    is normalized.  Because normalization collapses all whitespace (newlines
    included), the value handed to `first_line` is already a single line.
    """
    raw = await first_text(
        card,
        [
            ".wants-card__description-text .overflow-hidden .d-inline",
            ".wants-card__description-text .overflow-hidden",
            ".wants-card__description-text",
            "[class*='description-text']",
        ],
        "",
    )

    # Drop the "show more" / "hide" captions that are part of the card's text.
    for caption in ("Показать полностью", "Скрыть"):
        raw = raw.replace(caption, "")
    raw = raw.replace("\xa0", " ")

    return first_line(normalize_text(raw))
|
||||
|
||||
|
||||
async def scrape_items(
    *,
    url: str,
    item_selector: str,
    extractor: Extractor,
    wait_until: str = "networkidle",
    render_delay: float = 3.0,
) -> list[Project]:
    """Open *url* in headless Chromium and extract one record per matched element.

    Args:
        url: Page to load.
        item_selector: Selector whose matches are handed to *extractor*.
        extractor: Async callback receiving ``(page, item)``; returning None
            skips the item.
        wait_until: Load state passed to ``page.goto``.
        render_delay: Extra seconds to sleep after navigation so that
            client-side rendering can finish.

    Returns:
        The extracted records in page order.  An empty list is returned when
        the page as a whole could not be processed; that error is printed.

    Scraping stays best-effort per item: an extractor failure skips only that
    item.  Each failure is reported as a logging warning instead of being
    dropped silently, so a broken selector no longer looks like an empty page.
    """
    # Local import: the module's top-level imports do not include logging.
    import logging

    log = logging.getLogger(__name__)

    async with Stealth().use_async(async_playwright()) as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent=USER_AGENT,
                viewport=VIEWPORT,
            )
            page = await context.new_page()

            await page.goto(url, wait_until=wait_until)
            await asyncio.sleep(render_delay)

            result: list[Project] = []
            items = await page.locator(item_selector).all()

            for index, item in enumerate(items):
                try:
                    data = await extractor(page, item)
                except Exception as exc:
                    log.warning(
                        "Skipping item %d matched by %r: %s",
                        index,
                        item_selector,
                        exc,
                    )
                    continue
                if data is not None:
                    result.append(data)

            return result
        except Exception as e:
            print(f"Ошибка: {e}")
            return []
        finally:
            # Always release the browser, on success and on failure alike.
            await browser.close()
|
||||
|
||||
|
||||
async def extract_kwork_project(page: Page, title_block: Locator) -> Optional[Project]:
    """Turn one title block from the listing into a project record.

    Returns None when the block contains no link or the link has no href.
    Otherwise returns a dict with the keys "title", "price", "url" and
    "description"; price and description are read from the card that
    encloses the link.
    """
    anchor = title_block.locator("a").first
    if not await anchor.count():
        return None

    title = normalize_text(await safe_inner_text(anchor, ""))

    target = await anchor.get_attribute("href")
    if not target:
        return None

    # The title block is only a part of the card; climb to the card itself.
    root = await get_card_root(page, target)

    project: Project = {
        "title": title,
        "price": await extract_price(root),
        "url": normalize_url(target),
        "description": await extract_description(root),
    }
    return project
|
||||
|
||||
|
||||
async def get_kwork_projects() -> list[Project]:
    """Scrape the configured projects listing and return the extracted records."""
    print("Загружаем проекты...")

    projects = await scrape_items(
        url=PROJECTS_URL,
        item_selector=".wants-card__header-title",
        extractor=extract_kwork_project,
    )
    return projects
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run one scrape and dump the records as
    # pretty-printed JSON, keeping Cyrillic text unescaped.
    data = asyncio.run(get_kwork_projects())
    print(json.dumps(data, ensure_ascii=False, indent=4))
|
||||
4
requirements.txt
Normal file
4
requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
aiogram==3.26.0
|
||||
playwright==1.58.0
|
||||
playwright_stealth==2.0.2
|
||||
python-dotenv==1.2.2
|
||||
Reference in New Issue
Block a user