247 lines
7.4 KiB
Python
247 lines
7.4 KiB
Python
|
|
#!/usr/bin/env python3
"""
Job Scraper - Monitor job openings from companies you're interested in.

Usage:
    python main.py              # Run once
    python main.py --schedule   # Run daily at configured time
    python main.py --list       # List all tracked jobs
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
from datetime import datetime
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import yaml
|
||
|
|
|
||
|
|
from db import Database
|
||
|
|
from notify import ChangeReport, Notifier
|
||
|
|
from scrapers import AshbyScraper, GreenhouseScraper, LeverScraper
|
||
|
|
from scrapers.base import BaseScraper, Job
|
||
|
|
from dashboard import generate_dashboard
|
||
|
|
|
||
|
|
|
||
|
|
def load_config(config_path: str = "config.yaml") -> dict:
    """Load and parse the YAML configuration file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration as a dict.

    Raises:
        FileNotFoundError: If the config file does not exist.
        yaml.YAMLError: If the file contains invalid YAML.
    """
    # Explicit encoding: the platform default is locale-dependent and can
    # break on configs containing non-ASCII text (e.g. company names).
    with open(config_path, encoding="utf-8") as f:
        return yaml.safe_load(f)
|
||
|
|
|
||
|
|
|
||
|
|
def get_scraper(company_config: dict) -> BaseScraper:
    """Build the scraper matching a company's configured job-board platform.

    Args:
        company_config: Company entry from the config. Must contain
            "platform" and "name" plus the platform-specific key
            ("board_token", "lever_company", or "ashby_company").

    Returns:
        A scraper instance for the company's platform.

    Raises:
        ValueError: If the platform is not one of the supported ones.
        KeyError: If a required configuration key is missing.
    """
    platform = company_config["platform"]
    company = company_config["name"]

    # Guard-clause dispatch: return as soon as the platform matches.
    if platform == "greenhouse":
        return GreenhouseScraper(company, company_config["board_token"])
    if platform == "lever":
        return LeverScraper(company, company_config["lever_company"])
    if platform == "ashby":
        return AshbyScraper(company, company_config["ashby_company"])

    raise ValueError(f"Unknown platform: {platform}")
|
||
|
|
|
||
|
|
|
||
|
|
def filter_jobs_by_title(jobs: list[Job], title_filters: list[str]) -> list[Job]:
    """Keep only the jobs whose title contains at least one filter keyword.

    Matching is case-insensitive substring matching. An empty filter list
    means "no filtering": the input list is returned unchanged.

    Args:
        jobs: Jobs to filter.
        title_filters: Keywords to look for in each job title.

    Returns:
        The matching jobs, in their original order.
    """
    if not title_filters:
        return jobs

    # Lower-case the keywords once instead of once per job.
    keywords = [keyword.lower() for keyword in title_filters]
    return [
        job
        for job in jobs
        if any(keyword in job.title.lower() for keyword in keywords)
    ]
|
||
|
|
|
||
|
|
|
||
|
|
def _matches_location(job: Job, location_filters: list[str]) -> bool:
    """Return True if the job's location matches a filter, or it has no
    listed location but is flagged remote (mirrors the original elif)."""
    if job.location:
        loc_lower = job.location.lower()
        return any(f.lower() in loc_lower for f in location_filters)
    return job.remote_type == "remote"


def scrape_company(company_config: dict, db: Database, config: dict) -> ChangeReport:
    """Scrape one company's job board and record what changed.

    Fetches the current openings, filters them by configured title
    keywords, diffs them against the jobs stored in the database, and
    upserts new jobs / marks vanished jobs as removed.

    Args:
        company_config: Company entry from the config (platform, name,
            platform-specific identifier).
        db: Database used to persist companies and jobs.
        config: Full application config (title/location filters).

    Returns:
        A ChangeReport with the new and removed jobs for this company.
        On any scraping error, a zero-change report is returned so one
        failing company does not abort the whole run.
    """
    name = company_config["name"]
    print(f"\n🔍 Scraping {name}...", end=" ", flush=True)

    try:
        with get_scraper(company_config) as scraper:
            # Get current jobs from the career page
            all_jobs = scraper.scrape()

            # Filter by title keywords if configured
            title_filters = config.get("title_filters", [])
            current_jobs = filter_jobs_by_title(all_jobs, title_filters)

            print(f"found {len(current_jobs)} jobs (of {len(all_jobs)} total)")

            # BUG FIX: the original fallback chain was board_token ->
            # lever_company only, so Ashby companies were stored with an
            # empty jobs_url; include ashby_company in the chain.
            jobs_url = (
                company_config.get("board_token")
                or company_config.get("lever_company")
                or company_config.get("ashby_company")
                or ""
            )
            company_id = db.get_or_create_company(
                name,
                jobs_url=jobs_url,
                platform_type=company_config["platform"],
            )

            # Diff the scraped jobs against what is currently stored.
            stored_jobs = db.get_active_jobs(company_id)
            current_ids = {job.external_id for job in current_jobs}
            removed_ids = set(stored_jobs.keys()) - current_ids

            # Upsert every current job, collecting the genuinely new ones
            # (upsert_job reports whether the job was newly inserted).
            new_jobs = []
            for job in current_jobs:
                is_new, _ = db.upsert_job(company_id, job)
                if is_new:
                    new_jobs.append(job)

            # Mark jobs that disappeared from the board as removed.
            removed_jobs = db.mark_jobs_removed(company_id, removed_ids)

            # Record when this company was last scraped.
            db.update_company_scraped(company_id)

            # Highlight new jobs matching the configured locations.
            location_filters = config.get("location_filters", [])
            if location_filters and new_jobs:
                relevant_new = [
                    job for job in new_jobs
                    if _matches_location(job, location_filters)
                ]
                if relevant_new:
                    print(f" ⭐ {len(relevant_new)} jobs match your location filters!")

            return ChangeReport(
                company_name=name,
                new_jobs=new_jobs,
                removed_jobs=removed_jobs,
                total_active=len(current_jobs),
            )

    except Exception as e:
        # Best-effort: report the failure and return an empty report so
        # the caller can keep scraping the remaining companies.
        print(f"ERROR: {e}")
        return ChangeReport(
            company_name=name,
            new_jobs=[],
            removed_jobs=[],
            total_active=0,
        )
|
||
|
|
|
||
|
|
|
||
|
|
def run_scraper(config: dict):
    """Scrape every configured company once, notify, and rebuild the dashboard.

    Args:
        config: Full application config (companies, notifications, scraper
            settings).
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Job Scraper - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{banner}")

    db = Database()
    notifier = Notifier(config.get("notifications", {}))

    companies = config.get("companies", [])
    print(f"\nMonitoring {len(companies)} companies...")

    delay = config.get("scraper", {}).get("request_delay", 2)
    reports = []
    for i, company_config in enumerate(companies):
        reports.append(scrape_company(company_config, db, config))

        # Be polite to the job boards: pause between companies,
        # but skip the pause after the final one.
        if i < len(companies) - 1:
            time.sleep(delay)

    # Push the collected changes out via the configured channels.
    notifier.notify(reports)

    # Run summary across all companies.
    total_jobs = sum(r.total_active for r in reports)
    total_new = sum(len(r.new_jobs) for r in reports)
    total_removed = sum(len(r.removed_jobs) for r in reports)

    print(f"\n📊 Total: {total_jobs} active jobs across {len(companies)} companies")
    print(f" Changes: +{total_new} new, -{total_removed} removed")

    # Regenerate the static dashboard from the database contents.
    generate_dashboard()
|
||
|
|
|
||
|
|
|
||
|
|
def list_jobs(config: dict):
    """Print every tracked active job, grouped by company.

    Args:
        config: Application config (unused here; kept so all subcommands
            share the same call signature).
    """
    db = Database()
    jobs = db.get_all_active_jobs()

    if not jobs:
        print("No jobs tracked yet. Run the scraper first.")
        return

    divider = "=" * 60
    print(f"\n{divider}")
    print(f"All Tracked Jobs ({len(jobs)} total)")
    print(f"{divider}")

    previous_company = None
    for company_name, job in jobs:
        # Emit a company header whenever the company changes
        # (rows arrive grouped by company).
        if company_name != previous_company:
            print(f"\n📌 {company_name}")
            print("-" * 40)
            previous_company = company_name

        location = f" [{job.location}]" if job.location else ""
        remote = " 🏠" if job.remote_type == "remote" else ""
        print(f" • {job.title}{location}{remote}")
        print(f" {job.url}")
|
||
|
|
|
||
|
|
|
||
|
|
def run_scheduled(config: dict):
    """Run the scraper immediately, then once per day on a schedule.

    The daily run time is read from config ("scraper.schedule_time",
    "HH:MM" format), defaulting to "09:00" to preserve the previous
    hard-coded behaviour. Blocks forever; stop with Ctrl+C.

    Args:
        config: Full application config.
    """
    # Third-party dependency; imported lazily so the one-shot and --list
    # modes work even when it is not installed.
    import schedule

    # FIX: the run time was hard-coded to 09:00 although the module
    # docstring promises "run daily at configured time" — make it
    # configurable with a backward-compatible default.
    run_time = config.get("scraper", {}).get("schedule_time", "09:00")

    print("Starting scheduled job scraper...")
    print(f"Will run daily at {run_time}")
    print("Press Ctrl+C to stop\n")

    # Run immediately on start
    run_scraper(config)

    # Schedule daily run
    schedule.every().day.at(run_time).do(run_scraper, config)

    # Poll the scheduler once a minute, forever.
    while True:
        schedule.run_pending()
        time.sleep(60)
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Parse command-line arguments and dispatch to the requested mode."""
    parser = argparse.ArgumentParser(description="Job Scraper - Monitor job openings")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--schedule", action="store_true", help="Run on a schedule")
    parser.add_argument("--list", action="store_true", help="List all tracked jobs")
    args = parser.parse_args()

    # Bail out early with a clear message when the config file is missing.
    config_path = Path(args.config)
    if not config_path.exists():
        print(f"Error: Config file not found: {config_path}")
        sys.exit(1)

    config = load_config(args.config)

    # --list takes precedence over --schedule; default is one scrape run.
    if args.list:
        action = list_jobs
    elif args.schedule:
        action = run_scheduled
    else:
        action = run_scraper
    action(config)
|
||
|
|
|
||
|
|
|
||
|
|
# Standard entry-point guard: run the CLI only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|