This commit is contained in:
Bastian Gruber 2026-01-20 18:27:17 +00:00
parent 185b5ce2f1
commit fd4254df3e
Signed by: gruberb
GPG key ID: 426AF1CBA0530691
9 changed files with 1332 additions and 4176 deletions

65
CLAUDE.md Normal file
View file

@ -0,0 +1,65 @@
# Job Scraper
Job board monitoring for privacy-focused and open-source companies.
## Quick Reference
| Item | Value |
|------|-------|
| URL | https://jobs.novanexus.ca |
| Port | 8085 |
| Containers | job-scraper-scheduled, job-dashboard |
| Data | ./data/jobs.db, ./data/dashboard.html |
## Development Workflow
After making code changes, rebuild and deploy:
```bash
cd ~/job-scraper && sudo docker compose build && sudo docker compose up -d
```
## Common Commands
```bash
# View status
sudo docker compose ps
# View logs
sudo docker compose logs -f scraper-scheduled
# Run scraper manually (one-time)
sudo docker compose run --rm scraper
# Restart services
sudo docker compose restart
# Rebuild without cache (if having issues)
sudo docker compose build --no-cache && sudo docker compose up -d
```
## Configuration
- `config.yaml` - Companies to monitor, filters, notifications
- After config changes: `sudo docker compose restart scraper-scheduled`
## Email Notifications
Uses msmtp with the host user's config (`~/.msmtprc`). Docker Compose mounts this file read-only into the scraper containers at `/root/.msmtprc`.
To test email manually:
```bash
sudo docker compose run --rm scraper
```
## Files
- `main.py` - CLI entry point, scheduling
- `db.py` - SQLite database operations
- `notify.py` - Console, msmtp, email, Slack notifications
- `dashboard.py` - HTML dashboard generator
- `scrapers/` - Platform-specific scrapers (greenhouse, lever, ashby)
## Documentation
Full docs: ~/maple-docs/docs/04-services/job-scraper.md

View file

@ -2,6 +2,12 @@ FROM python:3.12-slim
WORKDIR /app WORKDIR /app
# Install msmtp for email notifications
RUN apt-get update && apt-get install -y --no-install-recommends \
msmtp \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Install dependencies # Install dependencies
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt

View file

@ -154,7 +154,12 @@ notifications:
# Console output is always enabled # Console output is always enabled
console: true console: true
# Uncomment and configure for email notifications # msmtp - uses system msmtp config (~/.msmtprc)
msmtp:
from_addr: admin@novanexus.ca
to_addr: me@bastiangruber.ca
# Uncomment and configure for SMTP email notifications (alternative to msmtp)
# email: # email:
# smtp_host: smtp.gmail.com # smtp_host: smtp.gmail.com
# smtp_port: 587 # smtp_port: 587

View file

@ -133,6 +133,14 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
db = Database() db = Database()
jobs = db.get_all_active_jobs() jobs = db.get_all_active_jobs()
# Get all monitored companies
all_company_names = db.get_all_companies()
# Track total jobs per company (before location filtering)
total_per_company = {}
for company_name, job in jobs:
total_per_company[company_name] = total_per_company.get(company_name, 0) + 1
# Group by company, filtering out irrelevant remote locations # Group by company, filtering out irrelevant remote locations
companies = {} companies = {}
filtered_count = 0 filtered_count = 0
@ -144,7 +152,15 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
companies[company_name] = [] companies[company_name] = []
companies[company_name].append(job) companies[company_name].append(job)
# Ensure all monitored companies are in the dict (even with 0 jobs)
for name in all_company_names:
if name not in companies:
companies[name] = []
if name not in total_per_company:
total_per_company[name] = 0
total_shown = sum(len(jobs) for jobs in companies.values()) total_shown = sum(len(jobs) for jobs in companies.values())
total_scraped = sum(total_per_company.values())
# Sort companies by name # Sort companies by name
sorted_companies = sorted(companies.items()) sorted_companies = sorted(companies.items())
@ -334,6 +350,13 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
.toc-links a:hover {{ .toc-links a:hover {{
text-decoration: underline; text-decoration: underline;
}} }}
.toc-links .empty {{
color: var(--muted);
cursor: default;
}}
.toc-links .empty:hover {{
text-decoration: none;
}}
.filter-buttons {{ .filter-buttons {{
display: flex; display: flex;
flex-wrap: wrap; flex-wrap: wrap;
@ -367,7 +390,7 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
<h1>$ job-board</h1> <h1>$ job-board</h1>
<div class="meta"> <div class="meta">
Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} | Last updated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} |
{total_shown} jobs across {len(companies)} companies {total_shown}/{total_scraped} jobs (location filtered) | Monitoring {len(all_company_names)} companies
</div> </div>
</header> </header>
@ -402,7 +425,12 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
# Table of contents # Table of contents
for company_name, company_jobs in sorted_companies: for company_name, company_jobs in sorted_companies:
anchor = company_name.lower().replace(" ", "-") anchor = company_name.lower().replace(" ", "-")
html += f' <a href="#{anchor}">{company_name} ({len(company_jobs)})</a>\n' filtered = len(company_jobs)
total = total_per_company.get(company_name, 0)
if filtered > 0:
html += f' <a href="#{anchor}">{company_name} ({filtered}/{total})</a>\n'
else:
html += f' <span class="empty">{company_name} (0/{total})</span>\n'
html += """ </div> html += """ </div>
</div> </div>
@ -410,8 +438,10 @@ def generate_dashboard(output_path: str = "data/dashboard.html"):
<main id="job-list"> <main id="job-list">
""" """
# Job listings # Job listings (only for companies with jobs)
for company_name, company_jobs in sorted_companies: for company_name, company_jobs in sorted_companies:
if not company_jobs:
continue # Skip companies with no jobs after filtering
anchor = company_name.lower().replace(" ", "-") anchor = company_name.lower().replace(" ", "-")
html += f""" html += f"""
<div class="company" id="{anchor}"> <div class="company" id="{anchor}">

File diff suppressed because it is too large Load diff

Binary file not shown.

13
db.py
View file

@ -35,8 +35,11 @@ class Database:
@contextmanager @contextmanager
def _get_conn(self): def _get_conn(self):
"""Get a database connection.""" """Get a database connection."""
conn = sqlite3.connect(self.db_path) conn = sqlite3.connect(self.db_path, timeout=30.0)
conn.row_factory = sqlite3.Row conn.row_factory = sqlite3.Row
# Enable WAL mode for better concurrency
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA busy_timeout=30000")
try: try:
yield conn yield conn
conn.commit() conn.commit()
@ -236,3 +239,11 @@ class Database:
) )
results.append((row["company_name"], job)) results.append((row["company_name"], job))
return results return results
def get_all_companies(self) -> list[str]:
    """Return the names of all active companies, ordered by name.

    Returns:
        The ``name`` column of every row in ``companies`` whose ``active``
        flag is true, in the order produced by the query's ``ORDER BY``.
    """
    query = "SELECT name FROM companies WHERE active = TRUE ORDER BY name"
    with self._get_conn() as conn:
        rows = conn.execute(query).fetchall()
    # Rows are addressed by column name, so the connection is expected to
    # hand back mapping-style rows.
    names = []
    for row in rows:
        names.append(row["name"])
    return names

View file

@ -6,6 +6,7 @@ services:
volumes: volumes:
- ./data:/app/data - ./data:/app/data
- ./config.yaml:/app/config.yaml:ro - ./config.yaml:/app/config.yaml:ro
- /home/gruberb/.msmtprc:/root/.msmtprc:ro
environment: environment:
- TZ=America/Toronto - TZ=America/Toronto
@ -16,6 +17,7 @@ services:
volumes: volumes:
- ./data:/app/data - ./data:/app/data
- ./config.yaml:/app/config.yaml:ro - ./config.yaml:/app/config.yaml:ro
- /home/gruberb/.msmtprc:/root/.msmtprc:ro
environment: environment:
- TZ=America/Toronto - TZ=America/Toronto
command: ["python", "main.py", "--schedule"] command: ["python", "main.py", "--schedule"]

View file

@ -38,6 +38,11 @@ class Notifier:
if email_config: if email_config:
self._notify_email(reports_with_changes, email_config) self._notify_email(reports_with_changes, email_config)
# msmtp (if configured - uses system msmtp config)
msmtp_config = self.config.get("msmtp")
if msmtp_config:
self._notify_msmtp(reports_with_changes, msmtp_config)
# Slack (if configured) # Slack (if configured)
slack_config = self.config.get("slack") slack_config = self.config.get("slack")
if slack_config: if slack_config:
@ -98,6 +103,83 @@ class Notifier:
except Exception as e: except Exception as e:
print(f"✗ Failed to send email: {e}") print(f"✗ Failed to send email: {e}")
def _notify_msmtp(self, reports: list[ChangeReport], config: dict):
    """Send a plain-text change summary by piping a message to ``msmtp``.

    Args:
        reports: Change reports to summarise. Each one is read for
            ``company_name``, ``total_active``, ``new_jobs`` and
            ``removed_jobs``; jobs are read for ``title``, ``location``,
            ``remote_type`` and ``url``.
        config: The ``msmtp`` notification settings. ``to_addr`` and
            ``from_addr`` are optional and fall back to built-in defaults.

    Returns:
        None. Success or failure is reported on stdout; no exception is
        propagated, so a mail failure never aborts the caller.
    """
    import subprocess
    from datetime import datetime
    from email.message import EmailMessage

    to_addr = config.get("to_addr", "me@bastiangruber.ca")
    from_addr = config.get("from_addr", "admin@novanexus.ca")

    total_new = sum(len(r.new_jobs) for r in reports)
    total_removed = sum(len(r.removed_jobs) for r in reports)

    # Build subject line
    parts = []
    if total_new:
        parts.append(f"+{total_new} new")
    if total_removed:
        parts.append(f"-{total_removed} removed")
    subject = f"Job Board Update: {', '.join(parts)}"

    # Build plain text body
    body_lines = [
        "JOB BOARD CHANGES",
        f"{datetime.now().strftime('%Y-%m-%d %H:%M')}",
        "",
        f"Summary: {total_new} new jobs, {total_removed} removed jobs",
        "",
    ]

    for report in reports:
        body_lines.append(f"{report.company_name} ({report.total_active} active)")
        body_lines.append("-" * 40)

        if report.new_jobs:
            body_lines.append(f" NEW ({len(report.new_jobs)}):")
            for job in report.new_jobs:
                location_str = f" [{job.location}]" if job.location else ""
                remote_str = " (Remote)" if job.remote_type == "remote" else ""
                body_lines.append(f" + {job.title}{location_str}{remote_str}")
                body_lines.append(f" {job.url}")

        if report.removed_jobs:
            body_lines.append(f" REMOVED ({len(report.removed_jobs)}):")
            for job in report.removed_jobs:
                body_lines.append(f" - {job.title}")

        body_lines.append("")

    body_lines.append("---")
    body_lines.append("Generated by job-scraper")
    body = "\n".join(body_lines)

    try:
        # EmailMessage emits a well-formed MIME message: it adds
        # MIME-Version and a transfer encoding for non-ASCII job titles,
        # always separates headers from the body with a blank line, and
        # rejects CR/LF in header values (so a bad config value cannot
        # inject extra headers).
        message = EmailMessage()
        message["Subject"] = subject
        message["From"] = from_addr
        message["To"] = to_addr
        message.set_content(body)

        result = subprocess.run(
            # "--" ends option parsing so the recipient is never read as a flag.
            ["msmtp", "--", to_addr],
            input=message.as_string(),
            capture_output=True,
            text=True,
            # Do not depend on the container locale for the pipe encoding.
            encoding="utf-8",
            errors="replace",
            # A stalled SMTP server must not block the caller indefinitely.
            timeout=60,
        )
    except FileNotFoundError:
        print("✗ msmtp not found - install with: apt install msmtp")
    except subprocess.TimeoutExpired:
        print("✗ msmtp failed: timed out after 60 seconds")
    except Exception as e:
        # Deliberate best-effort: notification problems are reported, not raised.
        print(f"✗ Failed to send msmtp notification: {e}")
    else:
        if result.returncode == 0:
            print("✓ msmtp notification sent")
        else:
            print(f"✗ msmtp failed: {result.stderr}")
def _notify_slack(self, reports: list[ChangeReport], config: dict): def _notify_slack(self, reports: list[ChangeReport], config: dict):
"""Send Slack notification.""" """Send Slack notification."""
import httpx import httpx