Thursday, November 20, 2025

A GUI Web Browser using only Python's standard library (Tahsin's First Browser)

 

#!/usr/bin/env python3

# PyTkBrowser: A minimal GUI web browser using only Python's standard library.

# Features:

# - Tkinter GUI (address bar, back/forward/reload, status bar)

# - HTTP/HTTPS GET via urllib

# - Basic HTML-to-text rendering (skips script/style)

# - Clickable links in the rendered page

# - Back/forward history, redirects handling

#

# Limitations:

# - No JavaScript/CSS layout

# - Images, forms, cookies are minimal/not supported

# - Rendering is plain text with numbered links


import tkinter as tk

from tkinter import ttk, messagebox

import urllib.request

import urllib.parse

import urllib.error

from html.parser import HTMLParser

import html

import re

import sys

from collections import deque


# Value of the User-Agent header sent with every request made by BrowserModel.fetch.
USER_AGENT = "PyTkBrowser/0.1 (stdlib-only)"

# Timeout passed to urllib.request.urlopen (seconds).
TIMEOUT = 20



class SimpleRenderer(HTMLParser):
    """Very basic HTML to text renderer with link extraction.

    Feed HTML with ``feed()`` (and ``close()``), then read:

    * ``get_text()`` -- the page body as plain text; every link is followed
      by a ``[n]`` marker where ``n`` is its 1-based position in ``links``.
    * ``links``      -- list of ``(label, absolute_url)`` tuples.
    * ``title``      -- contents of ``<title>`` (whitespace-normalized), or
      ``None`` when the document has no non-empty title.

    Content of ``<script>`` and ``<style>`` elements is skipped, and the
    ``<title>`` text is kept out of the body text.
    """

    # Tags whose start AND end produce a line break.
    _BLOCK_TAGS = ("p", "div", "section", "article")
    _HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")

    def __init__(self, base_url=None):
        """Create a renderer; *base_url* is used to resolve relative hrefs."""
        super().__init__()
        self.base_url = base_url
        self.in_script = False
        self.in_style = False
        self.text_chunks = []
        self.links = []  # list of (label, url)
        self.current_link = None  # {"url": ..., "text": ...} while inside <a href>
        self.list_level = 0
        self.title = None
        self.in_title = False
        self._title_raw = ""  # un-normalized title text collected so far

    def _ensure_line_break(self):
        """Start a new line unless the output is empty or already at one."""
        if self.text_chunks and not self.text_chunks[-1].endswith("\n"):
            self.text_chunks.append("\n")

    def handle_starttag(self, tag, attrs):
        """Track skip/title state, emit line breaks and bullets, open links."""
        tag = tag.lower()
        if tag == "script":
            self.in_script = True
        elif tag == "style":
            self.in_style = True
        elif tag in self._BLOCK_TAGS or tag in self._HEADING_TAGS or tag == "br":
            self.text_chunks.append("\n")
        elif tag in ("ul", "ol"):
            self.list_level += 1
        elif tag == "li":
            # Each item goes on its own line, even when the previous <li>
            # was never explicitly closed.
            self._ensure_line_break()
            self.text_chunks.append("  " * max(0, self.list_level - 1) + "• ")
        elif tag == "a":
            href = next((v for k, v in attrs if k.lower() == "href"), None)
            if href:
                abs_url = urllib.parse.urljoin(self.base_url or "", href)
                self.current_link = {"url": abs_url, "text": ""}
        elif tag == "title":
            self.in_title = True

    def handle_endtag(self, tag):
        """Close skip/title state, emit line breaks, finalize the open link."""
        tag = tag.lower()
        if tag == "script":
            self.in_script = False
        elif tag == "style":
            self.in_style = False
        elif tag in self._BLOCK_TAGS or tag in self._HEADING_TAGS or tag == "li":
            self.text_chunks.append("\n")
        elif tag in ("ul", "ol"):
            self.list_level = max(0, self.list_level - 1)
        elif tag == "a":
            if self.current_link:
                # Fall back to the URL when the anchor has no visible text.
                text = self.current_link["text"].strip() or self.current_link["url"]
                self.links.append((text, self.current_link["url"]))
                # Marker number == 1-based index into self.links.
                self.text_chunks.append(f" [{len(self.links)}]")
                self.current_link = None
        elif tag == "title":
            self.in_title = False

    def handle_data(self, data):
        """Collect text, collapsing every whitespace run to a single space."""
        if self.in_script or self.in_style:
            return
        cleaned = re.sub(r"\s+", " ", data)
        if not cleaned:
            return

        if self.in_title:
            # Title text belongs to the window title only, not the body.
            # Keep the raw text so words split across chunks stay separated.
            self._title_raw += cleaned
            self.title = " ".join(self._title_raw.split()) or None
            return

        if not cleaned.strip():
            # Whitespace-only node: it still separates inline elements
            # (e.g. two adjacent links), so keep one space unless the output
            # is empty or already ends in whitespace.
            if self.text_chunks and not self.text_chunks[-1][-1:].isspace():
                self.text_chunks.append(" ")
                if self.current_link is not None and self.current_link["text"]:
                    self.current_link["text"] += " "
            return

        if self.current_link is not None:
            self.current_link["text"] += cleaned
        self.text_chunks.append(cleaned)

    # NOTE: with HTMLParser's default convert_charrefs=True the two handlers
    # below are not invoked by the parser; they matter only if that is changed.

    def handle_entityref(self, name):
        """Decode a named entity such as ``&amp;`` and treat it as text."""
        self.handle_data(html.unescape(f"&{name};"))

    def handle_charref(self, name):
        """Decode a numeric reference (``#65``, ``#x41``, ``#X41``) as text."""
        # html.unescape accepts both hex prefixes and never raises on
        # out-of-range code points.
        self.handle_data(html.unescape(f"&#{name};"))

    def get_text(self):
        """Return the rendered text with runs of 3+ blank lines collapsed."""
        content = "".join(self.text_chunks)
        content = re.sub(r"\n\s*\n\s*\n+", "\n\n", content)
        return content.strip()



class BrowserModel:
    """Networking and history management.

    Keeps the currently displayed URL plus two stacks: ``history_back``
    (most recent entry on the right) and ``history_forward`` (next entry on
    the left). Navigation methods return ``(final_url, text, links, title)``.
    """

    def __init__(self):
        self.history_back = deque()
        self.history_forward = deque()
        self.current_url = None
        self.current_text = ""
        self.current_links = []
        self.title = ""

    def normalize_url(self, url: str) -> str:
        """Prefix ``http://`` when *url* has no ``scheme://`` part."""
        if not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", url):
            return "http://" + url
        return url

    def fetch(self, url: str):
        """GET *url* and return ``(final_url, decoded_text)``.

        ``final_url`` is the URL after redirects. Errors raised by
        ``urllib.request.urlopen`` (``HTTPError``, ``URLError``, ...)
        propagate to the caller.
        """
        url = self.normalize_url(url)
        # NOTE(review): any scheme urlopen supports is accepted here, so a
        # link on a remote page can point at file:// -- confirm this is wanted.
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
            final_url = resp.geturl()
            charset = self._get_charset(resp)
            data = resp.read()
        try:
            text = data.decode(charset, errors="replace")
        except LookupError:
            # Server announced an encoding Python does not know.
            text = data.decode("utf-8", errors="replace")
        return final_url, text

    def _get_charset(self, resp) -> str:
        """Return the charset named in Content-Type, defaulting to utf-8."""
        ct = resp.headers.get("Content-Type", "")
        # The value may be quoted: charset="utf-8".
        m = re.search(r"charset\s*=\s*[\"']?([\w\-]+)", ct, re.IGNORECASE)
        return m.group(1) if m else "utf-8"

    def render(self, url: str, html_text: str):
        """Render *html_text* and return ``(text, links, title)``.

        Also stores the result in ``current_text``, ``current_links`` and
        ``title``. The title falls back to *url* when the page has none.
        """
        renderer = SimpleRenderer(base_url=url)
        try:
            renderer.feed(html_text)
            renderer.close()  # flush text the parser may still be buffering
        except Exception:
            pass  # best-effort: show whatever was parsed before the failure
        text = renderer.get_text()
        self.current_text = text
        self.current_links = renderer.links
        self.title = renderer.title or url
        return text, renderer.links, self.title

    def open(self, url: str):
        """Navigate to *url*, recording the previous page in the back stack.

        The forward stack is cleared whenever the navigation lands on a
        different URL than the current one.
        """
        final_url, html_text = self.fetch(url)
        text, links, title = self.render(final_url, html_text)
        if self.current_url and self.current_url != final_url:
            self.history_back.append(self.current_url)
            self.history_forward.clear()
        self.current_url = final_url
        return final_url, text, links, title

    def back(self):
        """Go to the previous page; return ``None`` when there is none.

        History is only updated after the page loaded successfully, so a
        failed fetch leaves both stacks and ``current_url`` untouched.
        """
        if not self.history_back:
            return None
        previous = self.current_url
        result = self.open_direct(self.history_back[-1])
        self.history_back.pop()
        if previous is not None:
            self.history_forward.appendleft(previous)
        return result

    def forward(self):
        """Go to the next page; return ``None`` when there is none.

        The page being left is pushed onto the back stack so that Back
        works again afterwards. As in ``back()``, history is only updated
        after a successful load.
        """
        if not self.history_forward:
            return None
        previous = self.current_url
        result = self.open_direct(self.history_forward[0])
        self.history_forward.popleft()
        if previous is not None:
            self.history_back.append(previous)
        return result

    def open_direct(self, url: str):
        """Load *url* and make it current without touching either stack."""
        final_url, html_text = self.fetch(url)
        text, links, title = self.render(final_url, html_text)
        self.current_url = final_url
        return final_url, text, links, title



class PyTkBrowser(tk.Tk):
    """Main window: navigation toolbar, scrollable text view and status bar.

    All page loading is delegated to a ``BrowserModel``; this class only
    wires the widgets to it and displays the result.
    """

    def __init__(self):
        super().__init__()
        self.title("PyTkBrowser")
        self.geometry("900x600")
        self.model = BrowserModel()
        self._build_ui()

    def _build_ui(self):
        """Create and lay out all widgets, text tags and key bindings."""
        # Top bar
        top = ttk.Frame(self)
        top.pack(side=tk.TOP, fill=tk.X)

        self.btn_back = ttk.Button(top, text="◀ Back", width=8, command=self.on_back)
        self.btn_forward = ttk.Button(top, text="Forward ▶", width=10, command=self.on_forward)
        self.btn_reload = ttk.Button(top, text="Reload", width=8, command=self.on_reload)
        self.addr_var = tk.StringVar()
        self.addr_entry = ttk.Entry(top, textvariable=self.addr_var)
        self.btn_go = ttk.Button(top, text="Go", width=5, command=self.on_go)

        self.btn_back.pack(side=tk.LEFT, padx=4, pady=4)
        self.btn_forward.pack(side=tk.LEFT, padx=4, pady=4)
        self.btn_reload.pack(side=tk.LEFT, padx=4, pady=4)
        self.addr_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=4, pady=4)
        self.btn_go.pack(side=tk.LEFT, padx=4, pady=4)

        # Content area
        content = ttk.Frame(self)
        content.pack(side=tk.TOP, fill=tk.BOTH, expand=True)

        self.text = tk.Text(content, wrap="word")
        self.text.configure(font=("Helvetica", 12))
        self.text_scroll = ttk.Scrollbar(content, orient=tk.VERTICAL, command=self.text.yview)
        self.text.configure(yscrollcommand=self.text_scroll.set)

        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        self.text_scroll.pack(side=tk.LEFT, fill=tk.Y)

        # Status bar
        status = ttk.Frame(self)
        status.pack(side=tk.BOTTOM, fill=tk.X)
        self.status_var = tk.StringVar(value="Ready")
        self.status_label = ttk.Label(status, textvariable=self.status_var, anchor="w")
        self.status_label.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6, pady=3)

        # Shared look-and-feel tag for every link label; the URL itself is
        # carried by a per-link tag created in render_page.
        self.text.tag_configure("link", foreground="#0645AD", underline=True)
        self.text.tag_bind("link", "<Button-1>", self.on_link_click)
        self.text.tag_bind("link", "<Enter>", lambda e: self.text.config(cursor="hand2"))
        self.text.tag_bind("link", "<Leave>", lambda e: self.text.config(cursor=""))

        # Keyboard shortcuts
        self.bind("<Return>", lambda e: self.on_go())
        self.addr_entry.focus_set()

    def set_status(self, msg):
        """Show *msg* in the status bar and repaint immediately."""
        self.status_var.set(msg)
        # Loading blocks the event loop, so force the redraw now.
        self.update_idletasks()

    def _show_result(self, result, status_prefix="Loaded"):
        """Display a ``(final_url, text, links, title)`` tuple from the model."""
        final_url, text, links, title = result
        self.addr_var.set(final_url)
        self.render_page(final_url, text, links, title)
        self.set_status(f"{status_prefix}: {final_url}")

    def on_go(self):
        """Load the URL currently typed in the address bar."""
        url = self.addr_var.get().strip()
        if not url:
            return
        self.set_status(f"Loading {url} ...")
        try:
            self._show_result(self.model.open(url))
        except urllib.error.HTTPError as e:
            self.render_error(f"HTTP Error {e.code}: {e.reason}")
        except urllib.error.URLError as e:
            self.render_error(f"URL Error: {e.reason}")
        except Exception as e:
            # Top-level UI boundary: show the failure instead of crashing.
            self.render_error(f"Error: {e}")

    def on_back(self):
        """Navigate one step back in history, if possible."""
        self.set_status("Going back...")
        try:
            result = self.model.back()
        except Exception as e:
            self.render_error(f"Error: {e}")
            return
        if result:
            self._show_result(result)
        else:
            self.set_status("No history.")

    def on_forward(self):
        """Navigate one step forward in history, if possible."""
        self.set_status("Going forward...")
        try:
            result = self.model.forward()
        except Exception as e:
            self.render_error(f"Error: {e}")
            return
        if result:
            self._show_result(result)
        else:
            self.set_status("No forward history.")

    def on_reload(self):
        """Fetch the current page again without touching history."""
        if not self.model.current_url:
            self.set_status("No page loaded.")
            return
        self.set_status("Reloading...")
        try:
            self._show_result(self.model.open_direct(self.model.current_url), "Reloaded")
        except Exception as e:
            self.render_error(f"Error: {e}")

    def render_error(self, msg):
        """Replace the page content with *msg* and mirror it in the status bar."""
        self.text.delete("1.0", tk.END)
        self.text.insert(tk.END, msg)
        self.set_status(msg)

    def render_page(self, url, text, links, title):
        """Show the rendered page followed by a clickable, numbered link list.

        Each link line is ``  [n] label``. The label carries the shared
        ``link`` tag (colour, underline, hand cursor); the whole entry
        carries a per-link tag ``link_n`` whose click binding opens the
        target URL.
        """
        self.title(f"PyTkBrowser - {title}")
        self.text.delete("1.0", tk.END)

        # Insert main text
        self.text.insert(tk.END, text + "\n\n")

        if not links:
            self.text.insert(tk.END, "No links found.\n")
            return

        self.text.insert(tk.END, "Links:\n")
        for i, (label, target) in enumerate(links, start=1):
            tag_name = f"link_{i}"
            # Attach the tags at insertion time rather than computing index
            # ranges afterwards: index(END) is one line past the insertion
            # point, and hand-built "line.col" strings are easy to get wrong.
            self.text.insert(tk.END, f"  [{i}] ", (tag_name,))
            self.text.insert(tk.END, label, ("link", tag_name))
            self.text.insert(tk.END, "\n")
            # t=target binds the URL now (avoids the late-binding closure trap).
            # Re-binding replaces the binding left over from a previous page.
            self.text.tag_bind(tag_name, "<Button-1>", lambda e, t=target: self.open_url(t))

    def on_link_click(self, event):
        """No-op: clicks are handled by the per-link ``link_n`` bindings."""
        pass

    def open_url(self, target):
        """Put *target* in the address bar and navigate to it."""
        self.addr_var.set(target)
        self.on_go()



def main():
    """Start the browser, loading the first command-line argument if given."""
    browser = PyTkBrowser()
    cli_args = sys.argv[1:]
    if cli_args:
        # Behave as if the user had typed the URL and pressed Go.
        browser.addr_var.set(cli_args[0])
        browser.on_go()
    browser.mainloop()



# Launch the GUI only when this file is executed as a script, not on import.
if __name__ == "__main__":

    main()

No comments:

Post a Comment

Support Vector Machines in Machine Learning

Support Vector Machines in Machine Learning Introduction Support Vector Machines (SVMs) are powerful supervised learning algorithms used ...