job-scraper/dashboard.py
2026-01-20 18:27:17 +00:00

566 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Generate a simple text-based HTML dashboard of all tracked jobs.
"""
import re
from datetime import datetime
from pathlib import Path
from db import Database
# Location keywords that make a posting relevant. They are compared as
# substrings of the lower-cased location text, so matching is
# case-insensitive.
DESIRED_REGIONS = [
    # Canada
    "canada",
    "toronto",
    "vancouver",
    # Germany
    "germany",
    "berlin",
    "munich",
    # Multi-country regions
    "emea",
    "americas",  # covers both North and South America
    "north america",
    # No geographic restriction
    "worldwide",
    "global",
    "anywhere",
]
# Location keywords for places we do not want (on-site there, or remote
# but restricted to them). Compared as substrings of the lower-cased
# location text.
EXCLUDED_LOCATIONS = [
    # United States: cities
    "san francisco", "new york", "nyc", "seattle",
    "austin", "boston", "chicago", "denver",
    "los angeles", "atlanta", "dallas", "houston",
    # United States: states
    "california", "washington", "texas", "massachusetts", "colorado",
    # United States: country spellings and prefixes
    "united states", "usa", "u.s.", "us-", "usa-",
    # United Kingdom and Ireland
    "london", "united kingdom", "uk", "dublin", "ireland",
    # Australia / APAC (not part of EMEA)
    "sydney", "melbourne", "australia",
    "singapore", "tokyo", "japan",
    "india", "bangalore", "bengaluru", "hyderabad", "delhi",
    "korea", "seoul", "taiwan", "taipei",
    "china", "beijing", "shenzhen",
    # Israel
    "israel", "tel aviv",
    # Latin America
    "brazil", "sao paulo", "mexico",
    # Individual European countries and cities
    "netherlands", "amsterdam", "france", "paris",
    "spain", "madrid", "portugal", "lisbon",
    "poland", "warsaw", "italy",
    "czech", "prague", "serbia", "belgrade",
    "cyprus", "limassol", "austria", "vienna",
    "sweden", "stockholm", "denmark", "copenhagen",
    "switzerland", "romania", "bucharest", "hungary", "greece",
    # Africa and Southeast Asia
    "south africa", "indonesia", "jakarta", "malaysia",
]
def is_location_relevant(location: str, remote_type: str) -> bool:
    """
    Decide whether a job's location text passes the strict region filter.

    A job is kept only when its location mentions one of DESIRED_REGIONS
    (substring match on the lower-cased text) or is exactly "remote" once
    lower-cased and stripped. Everything else is rejected, including jobs
    with no location at all.

    Args:
        location: Free-form location text of the job; may be empty or None.
        remote_type: Accepted for interface compatibility; not consulted.

    Returns:
        True when the job should be shown, False otherwise.
    """
    # Missing location info is treated as "probably US-based" and dropped.
    if not location:
        return False

    haystack = location.lower()

    # A desired region wins even when excluded places are also listed,
    # e.g. "Remote (United States | Canada)" is kept because of Canada.
    for region in DESIRED_REGIONS:
        if region in haystack:
            return True

    # A bare "Remote" with no qualifier counts as unrestricted.
    if haystack.strip() == "remote":
        return True

    # Explicitly unwanted places, or office-bound wording without a
    # desired region, are rejected.
    mentions_excluded = any(place in haystack for place in EXCLUDED_LOCATIONS)
    office_markers = ("in-office", "hybrid", "on-site", "onsite", "office based")
    requires_office = any(marker in haystack for marker in office_markers)
    if mentions_excluded or requires_office:
        return False

    # Undetermined locations are rejected as well (the safer default).
    return False
def extract_location_tags(location: str, remote_type: str) -> tuple[list[str], str]:
    """
    Extract relevant location tags and a short display location.

    Matching is done on the lower-cased location text using substring
    checks. Tags are appended in a fixed order: remote, canada, germany,
    emea, americas, worldwide. When several regions match, the short
    location is the one checked last (e.g. "Canada / EMEA" -> "EMEA").

    Args:
        location: Free-form location text of the job; may be empty or None.
        remote_type: Remote classification of the job; the value "remote"
            adds the "remote" tag even if the text does not mention it.

    Returns:
        (list of tag names, short location string). Both are empty when
        ``location`` is empty.
    """
    if not location:
        return [], ""

    loc_lower = location.lower()
    tags = []
    short_loc = ""

    # Remote: flagged either by the structured field or by the text itself.
    is_remote = remote_type == "remote" or "remote" in loc_lower
    if is_remote:
        tags.append("remote")

    # Canada
    if any(x in loc_lower for x in ("canada", "toronto", "vancouver")):
        tags.append("canada")
        short_loc = "Canada"

    # Germany: prefer the country name, otherwise show the city that was
    # actually matched, so a Munich-only posting is not labelled "Berlin".
    if any(x in loc_lower for x in ("germany", "berlin", "munich")):
        tags.append("germany")
        if "germany" in loc_lower:
            short_loc = "Germany"
        elif "berlin" in loc_lower:
            short_loc = "Berlin"
        else:
            short_loc = "Munich"

    # EMEA
    if "emea" in loc_lower:
        tags.append("emea")
        short_loc = "EMEA"

    # Americas / North America
    if "americas" in loc_lower or "north america" in loc_lower:
        tags.append("americas")
        short_loc = "Americas"

    # Worldwide
    if any(x in loc_lower for x in ("worldwide", "global", "anywhere")):
        tags.append("worldwide")
        short_loc = "Worldwide"

    # No specific region matched, but the job is remote.
    if not short_loc and is_remote:
        short_loc = "Remote"

    return tags, short_loc
def generate_dashboard(output_path: str = "data/dashboard.html"):
    """
    Generate a static HTML dashboard of location-relevant jobs.

    Reads the active jobs and the monitored company names from
    ``Database``, keeps the jobs accepted by ``is_location_relevant``,
    groups them per company and writes one self-contained HTML page
    (inline CSS and JavaScript) to ``output_path``. Missing parent
    directories are created.

    Scraped text (company names, titles, URLs, locations, departments) is
    HTML-escaped before being embedded, and the file is written as UTF-8
    to match the page's ``<meta charset>`` declaration.

    Args:
        output_path: Destination of the generated HTML file.

    Returns:
        ``output_path``, unchanged.
    """
    # Imported locally so this function brings its own dependency.
    from html import escape

    db = Database()
    jobs = db.get_all_active_jobs()
    # Get all monitored companies
    all_company_names = db.get_all_companies()

    # One pass over the jobs: count every job per company (before location
    # filtering) and collect the ones that pass the location filter.
    total_per_company = {}
    companies = {}
    for company_name, job in jobs:
        total_per_company[company_name] = total_per_company.get(company_name, 0) + 1
        if is_location_relevant(job.location, job.remote_type):
            companies.setdefault(company_name, []).append(job)

    # Ensure all monitored companies are listed (even with 0 jobs)
    for name in all_company_names:
        companies.setdefault(name, [])
        total_per_company.setdefault(name, 0)

    total_shown = sum(len(kept) for kept in companies.values())
    total_scraped = sum(total_per_company.values())

    # Sort companies by name
    sorted_companies = sorted(companies.items())

    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def anchor_for(name):
        """Return the (escaped) in-page anchor id for a company name."""
        return escape(name.lower().replace(" ", "-"))

    # CSS class per location tag, in display order. The germany tag is
    # rendered with the .tag-berlin style defined in the stylesheet below.
    tag_classes = {
        "remote": "tag-remote",
        "canada": "tag-canada",
        "germany": "tag-berlin",
        "emea": "tag-emea",
        "americas": "tag-americas",
        "worldwide": "tag-worldwide",
    }

    # The page is assembled as a list of fragments and joined once at the end.
    parts = [f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Job Board</title>
<style>
:root {{
--bg: #1a1a1a;
--fg: #e0e0e0;
--accent: #4a9eff;
--muted: #888;
--border: #333;
--highlight: #2a2a2a;
}}
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
body {{
font-family: "SF Mono", "Monaco", "Inconsolata", "Fira Code", monospace;
font-size: 14px;
line-height: 1.6;
background: var(--bg);
color: var(--fg);
padding: 20px;
max-width: 1200px;
margin: 0 auto;
}}
header {{
border-bottom: 1px solid var(--border);
padding-bottom: 15px;
margin-bottom: 20px;
}}
h1 {{
font-size: 18px;
font-weight: normal;
color: var(--accent);
}}
.meta {{
color: var(--muted);
font-size: 12px;
margin-top: 5px;
}}
.filters {{
margin: 15px 0;
padding: 10px;
background: var(--highlight);
border-radius: 4px;
}}
.filters input {{
background: var(--bg);
border: 1px solid var(--border);
color: var(--fg);
padding: 8px 12px;
width: 100%;
max-width: 400px;
font-family: inherit;
font-size: 14px;
border-radius: 4px;
}}
.filters input:focus {{
outline: none;
border-color: var(--accent);
}}
.stats {{
display: flex;
gap: 20px;
margin: 10px 0;
font-size: 12px;
color: var(--muted);
}}
.company {{
margin-bottom: 25px;
}}
.company-header {{
display: flex;
align-items: baseline;
gap: 10px;
padding: 8px 0;
border-bottom: 1px solid var(--border);
cursor: pointer;
}}
.company-header:hover {{
color: var(--accent);
}}
.company-name {{
font-weight: bold;
color: var(--accent);
}}
.company-count {{
color: var(--muted);
font-size: 12px;
}}
.jobs {{
margin-left: 0;
}}
.job {{
padding: 6px 0;
border-bottom: 1px solid var(--border);
display: flex;
justify-content: space-between;
gap: 20px;
align-items: baseline;
}}
.job:last-child {{
border-bottom: none;
}}
.job:hover {{
background: var(--highlight);
}}
.job-title {{
overflow: hidden;
text-overflow: ellipsis;
}}
.job-title a {{
color: var(--fg);
text-decoration: none;
}}
.job-title a:hover {{
color: var(--accent);
text-decoration: underline;
}}
.job-location {{
color: var(--muted);
font-size: 12px;
text-align: right;
white-space: nowrap;
flex-shrink: 0;
}}
.tag {{
display: inline-block;
padding: 2px 6px;
border-radius: 3px;
font-size: 11px;
margin-left: 5px;
}}
.tag-remote {{
background: #1a4a1a;
color: #4ade80;
}}
.tag-canada {{
background: #4a1a1a;
color: #f87171;
}}
.tag-berlin {{
background: #4a4a1a;
color: #facc15;
}}
.tag-emea {{
background: #1a3a4a;
color: #60a5fa;
}}
.tag-americas {{
background: #3a1a4a;
color: #c084fc;
}}
.tag-worldwide {{
background: #1a4a3a;
color: #34d399;
}}
.hidden {{
display: none;
}}
.toc {{
margin: 20px 0;
padding: 15px;
background: var(--highlight);
border-radius: 4px;
}}
.toc-title {{
font-size: 12px;
color: var(--muted);
margin-bottom: 10px;
}}
.toc-links {{
display: flex;
flex-wrap: wrap;
gap: 10px;
}}
.toc-links a {{
color: var(--accent);
text-decoration: none;
font-size: 13px;
}}
.toc-links a:hover {{
text-decoration: underline;
}}
.toc-links .empty {{
color: var(--muted);
cursor: default;
}}
.toc-links .empty:hover {{
text-decoration: none;
}}
.filter-buttons {{
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 10px;
}}
.filter-btn {{
background: var(--bg);
border: 1px solid var(--border);
color: var(--muted);
padding: 4px 12px;
font-family: inherit;
font-size: 12px;
border-radius: 4px;
cursor: pointer;
transition: all 0.15s;
}}
.filter-btn:hover {{
border-color: var(--accent);
color: var(--fg);
}}
.filter-btn.active {{
background: var(--accent);
border-color: var(--accent);
color: var(--bg);
}}
</style>
</head>
<body>
<header>
<h1>$ job-board</h1>
<div class="meta">
Last updated: {generated_at} |
{total_shown}/{total_scraped} jobs (location filtered) | Monitoring {len(all_company_names)} companies
</div>
</header>
<div class="filters">
<input type="text" id="search" placeholder="Filter jobs... (e.g. 'senior engineer', 'remote', 'canada')" autofocus>
<div class="filter-buttons">
<button class="filter-btn" data-filter="">All</button>
<button class="filter-btn" data-filter="engineer">Engineering</button>
<button class="filter-btn" data-filter="senior engineer">Senior Eng</button>
<button class="filter-btn" data-filter="staff principal">Staff+</button>
<button class="filter-btn" data-filter="manager director">Management</button>
<button class="filter-btn" data-filter="product">Product</button>
<button class="filter-btn" data-filter="design">Design</button>
<button class="filter-btn" data-filter="security">Security</button>
<button class="filter-btn" data-filter="remote">Remote</button>
<button class="filter-btn" data-filter="canada">Canada</button>
<button class="filter-btn" data-filter="germany">Germany</button>
<button class="filter-btn" data-filter="emea">EMEA</button>
<button class="filter-btn" data-filter="americas">Americas</button>
<button class="filter-btn" data-filter="worldwide">Worldwide</button>
</div>
<div class="stats">
<span id="visible-count">{total_shown} jobs shown</span>
</div>
</div>
<div class="toc">
<div class="toc-title">Jump to company:</div>
<div class="toc-links">
"""]

    # Table of contents: a link for companies with visible jobs, a muted
    # label for companies whose jobs were all filtered out.
    for company_name, company_jobs in sorted_companies:
        anchor = anchor_for(company_name)
        label = escape(company_name)
        shown = len(company_jobs)
        total = total_per_company.get(company_name, 0)
        if shown > 0:
            parts.append(f' <a href="#{anchor}">{label} ({shown}/{total})</a>\n')
        else:
            parts.append(f' <span class="empty">{label} (0/{total})</span>\n')

    parts.append(""" </div>
</div>
<main id="job-list">
""")

    # Job listings (only for companies with jobs)
    for company_name, company_jobs in sorted_companies:
        if not company_jobs:
            continue  # Skip companies with no jobs after filtering
        parts.append(f"""
<div class="company" id="{anchor_for(company_name)}">
<div class="company-header">
<span class="company-name">{escape(company_name)}</span>
<span class="company-count">{len(company_jobs)} positions</span>
</div>
<div class="jobs">
""")
        for job in sorted(company_jobs, key=lambda j: j.title):
            location = job.location or ""
            # Extract tags and short location
            tag_list, short_loc = extract_location_tags(location, job.remote_type)
            # Tag badges always appear in the fixed order of tag_classes.
            tags_html = "".join(
                f'<span class="tag {css_class}">{tag}</span>'
                for tag, css_class in tag_classes.items()
                if tag in tag_list
            )
            # Lower-cased text the client-side filter matches against.
            search_text = " ".join((
                job.title.lower(),
                location.lower(),
                (job.department or "").lower(),
                " ".join(tag_list),
            ))
            # rel="noopener noreferrer": the opened (scraped) page gets no
            # handle on this window and no referrer.
            parts.append(f""" <div class="job" data-search="{escape(search_text)}">
<span class="job-title"><a href="{escape(job.url or "")}" target="_blank" rel="noopener noreferrer">{escape(job.title)}</a>{tags_html}</span>
<span class="job-location">{short_loc}</span>
</div>
""")
        parts.append(""" </div>
</div>
""")

    # Plain (non-f) string: JavaScript braces and ${...} are literal here.
    parts.append(""" </main>
<script>
const search = document.getElementById('search');
const jobs = document.querySelectorAll('.job');
const companies = document.querySelectorAll('.company');
const visibleCount = document.getElementById('visible-count');
const filterBtns = document.querySelectorAll('.filter-btn');
function filterJobs(query) {
let visible = 0;
const terms = query.toLowerCase().trim().split(/\\s+/).filter(t => t);
companies.forEach(company => {
const companyJobs = company.querySelectorAll('.job');
let companyVisible = 0;
companyJobs.forEach(job => {
const searchText = job.dataset.search;
// Match if ANY term matches (OR logic for filter buttons)
const matches = terms.length === 0 || terms.some(term => searchText.includes(term));
job.classList.toggle('hidden', !matches);
if (matches) {
companyVisible++;
visible++;
}
});
company.classList.toggle('hidden', companyVisible === 0);
});
visibleCount.textContent = `${visible} jobs shown`;
}
search.addEventListener('input', (e) => {
// Clear active button when typing
filterBtns.forEach(btn => btn.classList.remove('active'));
filterJobs(e.target.value);
});
// Filter buttons
filterBtns.forEach(btn => {
btn.addEventListener('click', () => {
const filter = btn.dataset.filter;
search.value = filter;
filterBtns.forEach(b => b.classList.remove('active'));
btn.classList.add('active');
filterJobs(filter);
});
});
// Keyboard shortcut: / to focus search
document.addEventListener('keydown', (e) => {
if (e.key === '/' && document.activeElement !== search) {
e.preventDefault();
search.focus();
}
if (e.key === 'Escape') {
search.value = '';
filterBtns.forEach(b => b.classList.remove('active'));
filterJobs('');
search.blur();
}
});
// Set "All" as active by default
filterBtns[0].classList.add('active');
</script>
</body>
</html>
""")

    # Write the file as UTF-8, matching the charset declared in the page.
    output = Path(output_path)
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text("".join(parts), encoding="utf-8")
    print(f"Dashboard generated: {output_path}")
    return output_path
# Script entry point: build the dashboard at the default output path.
if __name__ == "__main__":
    generate_dashboard()