Refined
This commit is contained in:
parent
185b5ce2f1
commit
fd4254df3e
9 changed files with 1332 additions and 4176 deletions
65
CLAUDE.md
Normal file
65
CLAUDE.md
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
# Job Scraper
|
||||
|
||||
Job board monitoring for privacy-focused and open-source companies.
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Item | Value |
|
||||
|------|-------|
|
||||
| URL | https://jobs.novanexus.ca |
|
||||
| Port | 8085 |
|
||||
| Containers | job-scraper-scheduled, job-dashboard |
|
||||
| Data | ./data/jobs.db, ./data/dashboard.html |
|
||||
|
||||
## Development Workflow
|
||||
|
||||
After making code changes, rebuild and deploy:
|
||||
|
||||
```bash
|
||||
cd ~/job-scraper && sudo docker compose build && sudo docker compose up -d
|
||||
```
|
||||
|
||||
## Common Commands
|
||||
|
||||
```bash
|
||||
# View status
|
||||
sudo docker compose ps
|
||||
|
||||
# View logs
|
||||
sudo docker compose logs -f scraper-scheduled
|
||||
|
||||
# Run scraper manually (one-time)
|
||||
sudo docker compose run --rm scraper
|
||||
|
||||
# Restart services
|
||||
sudo docker compose restart
|
||||
|
||||
# Rebuild without cache (if having issues)
|
||||
sudo docker compose build --no-cache && sudo docker compose up -d
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
- `config.yaml` - Companies to monitor, filters, notifications
|
||||
- After config changes: `sudo docker compose restart scraper-scheduled`
|
||||
|
||||
## Email Notifications
|
||||
|
||||
Uses msmtp with system config (`~/.msmtprc`). The container mounts this file.
|
||||
|
||||
To test email manually:
|
||||
```bash
|
||||
sudo docker compose run --rm scraper
|
||||
```
|
||||
|
||||
## Files
|
||||
|
||||
- `main.py` - CLI entry point, scheduling
|
||||
- `db.py` - SQLite database operations
|
||||
- `notify.py` - Console, msmtp, email, Slack notifications
|
||||
- `dashboard.py` - HTML dashboard generator
|
||||
- `scrapers/` - Platform-specific scrapers (greenhouse, lever, ashby)
|
||||
|
||||
## Documentation
|
||||
|
||||
Full docs: ~/maple-docs/docs/04-services/job-scraper.md
|
||||
|
|
@ -2,6 +2,12 @@ FROM python:3.12-slim
|
|||
|
||||
WORKDIR /app
|
||||
|
||||
# Install msmtp for email notifications
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
msmtp \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
|
|
|||
|
|
@ -154,7 +154,12 @@ notifications:
|
|||
# Console output is always enabled
|
||||
console: true
|
||||
|
||||
# Uncomment and configure for email notifications
|
||||
# msmtp - uses system msmtp config (~/.msmtprc)
|
||||
msmtp:
|
||||
from_addr: admin@novanexus.ca
|
||||
to_addr: me@bastiangruber.ca
|
||||
|
||||
# Uncomment and configure for SMTP email notifications (alternative to msmtp)
|
||||
# email:
|
||||
# smtp_host: smtp.gmail.com
|
||||
# smtp_port: 587
|
||||
|
|
|
|||
36
dashboard.py
36
dashboard.py
|
|
@ -133,6 +133,14 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
|
|||
db = Database()
|
||||
jobs = db.get_all_active_jobs()
|
||||
|
||||
# Get all monitored companies
|
||||
all_company_names = db.get_all_companies()
|
||||
|
||||
# Track total jobs per company (before location filtering)
|
||||
total_per_company = {}
|
||||
for company_name, job in jobs:
|
||||
total_per_company[company_name] = total_per_company.get(company_name, 0) + 1
|
||||
|
||||
# Group by company, filtering out irrelevant remote locations
|
||||
companies = {}
|
||||
filtered_count = 0
|
||||
|
|
@ -144,7 +152,15 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
|
|||
companies[company_name] = []
|
||||
companies[company_name].append(job)
|
||||
|
||||
# Ensure all monitored companies are in the dict (even with 0 jobs)
|
||||
for name in all_company_names:
|
||||
if name not in companies:
|
||||
companies[name] = []
|
||||
if name not in total_per_company:
|
||||
total_per_company[name] = 0
|
||||
|
||||
total_shown = sum(len(jobs) for jobs in companies.values())
|
||||
total_scraped = sum(total_per_company.values())
|
||||
|
||||
# Sort companies by name
|
||||
sorted_companies = sorted(companies.items())
|
||||
|
|
@ -334,6 +350,13 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
|
|||
.toc-links a:hover {{
|
||||
text-decoration: underline;
|
||||
}}
|
||||
.toc-links .empty {{
|
||||
color: var(--muted);
|
||||
cursor: default;
|
||||
}}
|
||||
.toc-links .empty:hover {{
|
||||
text-decoration: none;
|
||||
}}
|
||||
.filter-buttons {{
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
|
|
@ -367,7 +390,7 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
|
|||
<h1>$ job-board</h1>
|
||||
<div class="meta">
|
||||
Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} |
|
||||
{total_shown} jobs across {len(companies)} companies
|
||||
{total_shown}/{total_scraped} jobs (location filtered) | Monitoring {len(all_company_names)} companies
|
||||
</div>
|
||||
</header>
|
||||
|
||||
|
|
@ -402,7 +425,12 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
|
|||
# Table of contents
|
||||
for company_name, company_jobs in sorted_companies:
|
||||
anchor = company_name.lower().replace(" ", "-")
|
||||
html += f' <a href="#{anchor}">{company_name} ({len(company_jobs)})</a>\n'
|
||||
filtered = len(company_jobs)
|
||||
total = total_per_company.get(company_name, 0)
|
||||
if filtered > 0:
|
||||
html += f' <a href="#{anchor}">{company_name} ({filtered}/{total})</a>\n'
|
||||
else:
|
||||
html += f' <span class="empty">{company_name} (0/{total})</span>\n'
|
||||
|
||||
html += """ </div>
|
||||
</div>
|
||||
|
|
@ -410,8 +438,10 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
|
|||
<main id="job-list">
|
||||
"""
|
||||
|
||||
# Job listings
|
||||
# Job listings (only for companies with jobs)
|
||||
for company_name, company_jobs in sorted_companies:
|
||||
if not company_jobs:
|
||||
continue # Skip companies with no jobs after filtering
|
||||
anchor = company_name.lower().replace(" ", "-")
|
||||
html += f"""
|
||||
<div class="company" id="{anchor}">
|
||||
|
|
|
|||
5297
data/dashboard.html
5297
data/dashboard.html
File diff suppressed because it is too large
Load diff
Binary file not shown.
13
db.py
13
db.py
|
|
@ -35,8 +35,11 @@ class Database:
|
|||
@contextmanager
|
||||
def _get_conn(self):
|
||||
"""Get a database connection."""
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
conn = sqlite3.connect(self.db_path, timeout=30.0)
|
||||
conn.row_factory = sqlite3.Row
|
||||
# Enable WAL mode for better concurrency
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.execute("PRAGMA busy_timeout=30000")
|
||||
try:
|
||||
yield conn
|
||||
conn.commit()
|
||||
|
|
@ -236,3 +239,11 @@ class Database:
|
|||
)
|
||||
results.append((row["company_name"], job))
|
||||
return results
|
||||
|
||||
def get_all_companies(self) -> list[str]:
    """Return the names of all active companies, sorted by name.

    Only rows of the ``companies`` table whose ``active`` flag is true are
    included; inactive companies are left out. Ordering is done by the
    query itself.

    Returns:
        A list of company names in ascending name order (empty when no
        company is active).
    """
    with self._get_conn() as conn:
        cursor = conn.execute(
            "SELECT name FROM companies WHERE active = TRUE ORDER BY name"
        )
        # Rows are read by column name, so the connection must use a
        # name-indexable row factory. Iterating the cursor avoids building
        # an intermediate list of rows first.
        return [row["name"] for row in cursor]
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ services:
|
|||
volumes:
|
||||
- ./data:/app/data
|
||||
- ./config.yaml:/app/config.yaml:ro
|
||||
- /home/gruberb/.msmtprc:/root/.msmtprc:ro
|
||||
environment:
|
||||
- TZ=America/Toronto
|
||||
|
||||
|
|
@ -16,6 +17,7 @@ services:
|
|||
volumes:
|
||||
- ./data:/app/data
|
||||
- ./config.yaml:/app/config.yaml:ro
|
||||
- /home/gruberb/.msmtprc:/root/.msmtprc:ro
|
||||
environment:
|
||||
- TZ=America/Toronto
|
||||
command: ["python", "main.py", "--schedule"]
|
||||
|
|
|
|||
82
notify.py
82
notify.py
|
|
@ -38,6 +38,11 @@ class Notifier:
|
|||
if email_config:
|
||||
self._notify_email(reports_with_changes, email_config)
|
||||
|
||||
# msmtp (if configured - uses system msmtp config)
|
||||
msmtp_config = self.config.get("msmtp")
|
||||
if msmtp_config:
|
||||
self._notify_msmtp(reports_with_changes, msmtp_config)
|
||||
|
||||
# Slack (if configured)
|
||||
slack_config = self.config.get("slack")
|
||||
if slack_config:
|
||||
|
|
@ -98,6 +103,83 @@ class Notifier:
|
|||
except Exception as e:
|
||||
print(f"✗ Failed to send email: {e}")
|
||||
|
||||
def _notify_msmtp(self, reports: list[ChangeReport], config: dict):
    """Send a plain-text change summary by piping a message to ``msmtp``.

    The message is built with the standard library's ``EmailMessage`` so
    that headers are validated (values containing line breaks are rejected)
    and the UTF-8 body gets a proper MIME envelope. Delivery is best-effort:
    every failure is reported on stdout and never raised to the caller.

    Args:
        reports: Change reports to summarise. Each report is read for
            ``company_name``, ``total_active``, ``new_jobs`` and
            ``removed_jobs``; jobs are read for ``title``, ``location``,
            ``remote_type`` and ``url``.
        config: The ``msmtp`` notification settings. ``to_addr`` and
            ``from_addr`` are optional and fall back to built-in defaults.
    """
    import subprocess
    from datetime import datetime
    from email.message import EmailMessage

    to_addr = config.get("to_addr", "me@bastiangruber.ca")
    from_addr = config.get("from_addr", "admin@novanexus.ca")

    total_new = sum(len(r.new_jobs) for r in reports)
    total_removed = sum(len(r.removed_jobs) for r in reports)

    # Build subject line
    parts = []
    if total_new:
        parts.append(f"+{total_new} new")
    if total_removed:
        parts.append(f"-{total_removed} removed")
    # Avoid a dangling "Job Board Update: " when there is nothing to count.
    subject = f"Job Board Update: {', '.join(parts) or 'no changes'}"

    # Build plain text body
    body_lines = [
        "JOB BOARD CHANGES",
        f"{datetime.now().strftime('%Y-%m-%d %H:%M')}",
        "",
        f"Summary: {total_new} new jobs, {total_removed} removed jobs",
        "",
    ]

    for report in reports:
        body_lines.append(f"{report.company_name} ({report.total_active} active)")
        body_lines.append("-" * 40)

        if report.new_jobs:
            body_lines.append(f"  NEW ({len(report.new_jobs)}):")
            for job in report.new_jobs:
                location_str = f" [{job.location}]" if job.location else ""
                remote_str = " (Remote)" if job.remote_type == "remote" else ""
                body_lines.append(f"    + {job.title}{location_str}{remote_str}")
                body_lines.append(f"      {job.url}")

        if report.removed_jobs:
            body_lines.append(f"  REMOVED ({len(report.removed_jobs)}):")
            for job in report.removed_jobs:
                body_lines.append(f"    - {job.title}")

        body_lines.append("")

    body_lines.append("---")
    body_lines.append("Generated by job-scraper")

    body = "\n".join(body_lines)

    try:
        # Header assignment raises ValueError on embedded CR/LF, so it stays
        # inside the best-effort block together with the send itself.
        message = EmailMessage()
        message["Subject"] = subject
        message["From"] = from_addr
        message["To"] = to_addr
        message.set_content(body)

        result = subprocess.run(
            # "--" ends option parsing so the recipient can never be
            # interpreted as an msmtp command-line option.
            ["msmtp", "--", to_addr],
            # Bytes keep the encoding independent of the container locale.
            input=message.as_bytes(),
            capture_output=True,
            # Do not let a stalled SMTP relay hang the scheduled run forever.
            timeout=60,
        )
        if result.returncode == 0:
            print("✓ msmtp notification sent")
        else:
            stderr_text = result.stderr.decode("utf-8", errors="replace")
            print(f"✗ msmtp failed: {stderr_text}")
    except FileNotFoundError:
        print("✗ msmtp not found - install with: apt install msmtp")
    except subprocess.TimeoutExpired:
        print("✗ msmtp failed: timed out after 60 seconds")
    except Exception as e:
        print(f"✗ Failed to send msmtp notification: {e}")
|
||||
|
||||
def _notify_slack(self, reports: list[ChangeReport], config: dict):
|
||||
"""Send Slack notification."""
|
||||
import httpx
|
||||
|
|
|
|||
Loading…
Reference in a new issue