Skip to main content

babyrite/expand/
github.rs

1//! GitHub Permalink expansion.
2//!
3//! This module provides functionality for parsing GitHub permalink URLs
4//! and fetching raw file content to display as code blocks.
5
6use regex::Regex;
7use std::collections::HashSet;
8use std::sync::LazyLock;
9
10use super::{ExpandError, ExpandedContent};
11use crate::config::BabyriteConfig;
12use crate::utils::language_from_extension;
13
14/// Regex pattern for matching GitHub blob URLs.
15///
16/// Captures: owner, repo, git_ref (commit SHA or branch name), path, and optional line range fragment.
17///
18/// Supported patterns:
19/// - `https://github.com/{owner}/{repo}/blob/{ref}/{path}`
20/// - `https://github.com/{owner}/{repo}/blob/{ref}/{path}#L{line}`
21/// - `https://github.com/{owner}/{repo}/blob/{ref}/{path}#L{start}-L{end}`
22///
23/// The `{ref}` can be a commit SHA (e.g., `abcdef1234567`) or a branch/tag name (e.g., `main`, `feature/foo`).
24static GITHUB_PERMALINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
25    Regex::new(
26        r"https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/([^#\s]+)(?:#L(\d+)(?:-L(\d+))?)?",
27    )
28    .unwrap()
29});
30
31/// A parsed GitHub permalink.
32#[derive(Debug)]
33pub struct GitHubPermalink {
34    /// Repository owner.
35    pub owner: String,
36    /// Repository name.
37    pub repo: String,
38    /// Git ref (commit SHA or branch/tag name).
39    pub git_ref: String,
40    /// File path within the repository.
41    pub path: String,
42    /// Optional line range specification.
43    pub line_range: Option<LineRange>,
44}
45
/// Lines selected by the `#L…` fragment of a GitHub permalink.
#[derive(Debug, Clone, Copy)]
pub struct LineRange {
    /// First selected line, counted from 1.
    pub start: usize,
    /// Last selected line, counted from 1 and included in the selection.
    /// Equal to `start` when the fragment names a single line.
    pub end: usize,
}
54
55/// Errors that can occur when expanding a GitHub permalink.
56#[derive(thiserror::Error, Debug)]
57pub enum GitHubExpandError {
58    /// Failed to fetch the raw file content.
59    #[error("Failed to fetch raw content: {0}")]
60    Fetch(String),
61    /// The fetched content exceeds the maximum allowed size.
62    #[error("Content exceeds size limit")]
63    ContentTooLarge,
64    /// An HTTP error occurred.
65    #[error(transparent)]
66    Http(#[from] reqwest::Error),
67}
68
69impl GitHubPermalink {
70    /// Parses all GitHub permalink URLs from the given text.
71    ///
72    /// Matches URLs with commit SHAs, branch names, or tag names.
73    ///
74    /// Note: Duplicate URLs are ignored, and a maximum of 3 links are returned.
75    pub fn parse_all(text: &str) -> Vec<GitHubPermalink> {
76        let mut seen_urls = HashSet::new();
77        GITHUB_PERMALINK_REGEX
78            .captures_iter(text)
79            .filter_map(|captures| {
80                let m = captures.get(0)?;
81                let full_url = m.as_str();
82                // Skip URLs wrapped in angle brackets (e.g., <https://...>)
83                if m.start() > 0 && text.as_bytes()[m.start() - 1] == b'<' {
84                    return None;
85                }
86                if !seen_urls.insert(full_url.to_string()) {
87                    return None;
88                }
89
90                let owner = captures.get(1)?.as_str().to_string();
91                let repo = captures.get(2)?.as_str().to_string();
92                let git_ref = captures.get(3)?.as_str().to_string();
93                let path = captures.get(4)?.as_str().to_string();
94
95                let line_range = match (captures.get(5), captures.get(6)) {
96                    (Some(start), Some(end)) => {
97                        let s = start.as_str().parse().ok()?;
98                        let e = end.as_str().parse().ok()?;
99                        Some(LineRange { start: s, end: e })
100                    }
101                    (Some(start), None) => {
102                        let s = start.as_str().parse().ok()?;
103                        Some(LineRange { start: s, end: s })
104                    }
105                    _ => None,
106                };
107
108                Some(GitHubPermalink {
109                    owner,
110                    repo,
111                    git_ref,
112                    path,
113                    line_range,
114                })
115            })
116            .take(3) // Limit to maximum 3 links
117            .collect()
118    }
119
120    /// Fetches the raw file content from GitHub and returns a code block.
121    #[cfg_attr(coverage_nightly, coverage(off))]
122    pub async fn fetch(
123        &self,
124        http_client: &reqwest::Client,
125    ) -> Result<ExpandedContent, ExpandError> {
126        let config = BabyriteConfig::get();
127        let max_lines = config.github.max_lines;
128
129        let raw_url = format!(
130            "https://raw.githubusercontent.com/{}/{}/{}/{}",
131            self.owner, self.repo, self.git_ref, self.path
132        );
133
134        let response = http_client
135            .get(&raw_url)
136            .send()
137            .await
138            .map_err(GitHubExpandError::Http)?;
139
140        if !response.status().is_success() {
141            return Err(GitHubExpandError::Fetch(format!(
142                "HTTP {} for {}",
143                response.status(),
144                raw_url
145            ))
146            .into());
147        }
148
149        let content_length = response.content_length().unwrap_or(0);
150        // 1 MB limit to avoid fetching huge files
151        if content_length > 1_048_576 {
152            return Err(GitHubExpandError::ContentTooLarge.into());
153        }
154
155        let body = response.text().await.map_err(GitHubExpandError::Http)?;
156
157        Ok(self.build_code_block(&body, max_lines))
158    }
159
160    /// Builds an `ExpandedContent::CodeBlock` from raw file content.
161    fn build_code_block(&self, body: &str, max_lines: usize) -> ExpandedContent {
162        let all_lines: Vec<&str> = body.lines().collect();
163        let (code, line_info) = match self.line_range {
164            Some(range) => {
165                let start = range.start.saturating_sub(1); // 0-indexed
166                let end = range.end.min(all_lines.len());
167                let selected: Vec<&str> = all_lines.get(start..end).unwrap_or_default().to_vec();
168
169                let (code, truncated) = truncate_lines(&selected, max_lines);
170                let info = if truncated {
171                    format!(
172                        "L{}-L{}, truncated to {} lines",
173                        range.start, range.end, max_lines
174                    )
175                } else {
176                    format!("L{}-L{}", range.start, range.end)
177                };
178                (code, info)
179            }
180            None => {
181                let (code, truncated) = truncate_lines(&all_lines, max_lines);
182                let info = if truncated {
183                    format!("truncated to {} lines", max_lines)
184                } else {
185                    String::new()
186                };
187                (code, info)
188            }
189        };
190
191        let display_ref = shorten_ref(&self.git_ref);
192        let language = language_for_path(&self.path);
193
194        let metadata = if line_info.is_empty() {
195            format!(
196                "`{}` - {}/{}@{}",
197                self.path, self.owner, self.repo, display_ref
198            )
199        } else {
200            format!(
201                "`{}` ({}) - {}/{}@{}",
202                self.path, line_info, self.owner, self.repo, display_ref
203            )
204        };
205
206        ExpandedContent::CodeBlock {
207            language: language.to_string(),
208            code,
209            metadata,
210        }
211    }
212}
213
/// Heuristic check for a commit SHA: between 4 and 40 bytes long (inclusive)
/// and made up solely of ASCII hexadecimal digits, in either case.
fn is_commit_sha(s: &str) -> bool {
    let plausible_len = matches!(s.len(), 4..=40);
    plausible_len && s.chars().all(|c| c.is_ascii_hexdigit())
}
218
219/// Shortens a git ref for display. Commit SHAs are truncated to 7 characters;
220/// branch/tag names are returned as-is.
221fn shorten_ref(git_ref: &str) -> &str {
222    if is_commit_sha(git_ref) {
223        &git_ref[..7.min(git_ref.len())]
224    } else {
225        git_ref
226    }
227}
228
229/// Extracts the filename from a path and returns the appropriate language identifier.
230fn language_for_path(path: &str) -> &str {
231    let filename = path.rsplit('/').next().unwrap_or(path);
232    match filename.rsplit_once('.') {
233        Some((_, ext)) => language_from_extension(ext),
234        None => language_from_extension(filename),
235    }
236}
237
/// Joins at most `max` leading entries of `lines` with `\n`.
///
/// Returns the joined text together with a flag that is `true` when some
/// lines had to be dropped to respect the limit.
fn truncate_lines(lines: &[&str], max: usize) -> (String, bool) {
    // `get(..max)` is `None` when there are fewer than `max` lines, i.e. nothing to cut.
    let kept = lines.get(..max).unwrap_or(lines);
    (kept.join("\n"), kept.len() < lines.len())
}
247
#[cfg(test)]
mod tests {
    use super::*;

    // --- shared helpers ---

    /// Parses `text` and returns its only permalink, failing the test on any other count.
    fn parse_single(text: &str) -> GitHubPermalink {
        let mut found = GitHubPermalink::parse_all(text);
        assert_eq!(found.len(), 1, "failed for: {text}");
        found.remove(0)
    }

    /// Asserts that `link` carries exactly the given 1-indexed, inclusive line range.
    fn assert_range(link: &GitHubPermalink, start: usize, end: usize) {
        let range = link.line_range.expect("permalink should carry a line range");
        assert_eq!((range.start, range.end), (start, end));
    }

    /// Builds a permalink from string slices without going through the parser.
    fn permalink(
        owner: &str,
        repo: &str,
        git_ref: &str,
        path: &str,
        line_range: Option<LineRange>,
    ) -> GitHubPermalink {
        GitHubPermalink {
            owner: owner.to_string(),
            repo: repo.to_string(),
            git_ref: git_ref.to_string(),
            path: path.to_string(),
            line_range,
        }
    }

    /// Permalink into `owner/repo` at the commit `abcdef1234567`.
    fn make_permalink(path: &str, line_range: Option<LineRange>) -> GitHubPermalink {
        permalink("owner", "repo", "abcdef1234567", path, line_range)
    }

    /// Unpacks a code block into `(language, code, metadata)`, failing on any other content.
    fn code_block_parts(content: ExpandedContent) -> (String, String, String) {
        let ExpandedContent::CodeBlock {
            language,
            code,
            metadata,
        } = content
        else {
            panic!("expected CodeBlock");
        };
        (language, code, metadata)
    }

    // --- truncate_lines ---

    #[test]
    fn truncate_lines_under_limit() {
        let (joined, truncated) = truncate_lines(&["a", "b", "c"], 5);
        assert_eq!(joined, "a\nb\nc");
        assert!(!truncated);
    }

    #[test]
    fn truncate_lines_at_limit() {
        let (joined, truncated) = truncate_lines(&["a", "b", "c"], 3);
        assert_eq!(joined, "a\nb\nc");
        assert!(!truncated);
    }

    #[test]
    fn truncate_lines_over_limit() {
        let (joined, truncated) = truncate_lines(&["a", "b", "c", "d", "e"], 2);
        assert_eq!(joined, "a\nb");
        assert!(truncated);
    }

    #[test]
    fn truncate_lines_empty() {
        let (joined, truncated) = truncate_lines(&[], 5);
        assert_eq!(joined, "");
        assert!(!truncated);
    }

    // --- GitHubPermalink::parse_all ---

    #[test]
    fn parse_basic_permalink() {
        let link = parse_single(
            "https://github.com/owner/repo/blob/abcdef1234567890abcdef1234567890abcdef12/src/main.rs",
        );
        assert_eq!(link.owner, "owner");
        assert_eq!(link.repo, "repo");
        assert_eq!(link.git_ref, "abcdef1234567890abcdef1234567890abcdef12");
        assert_eq!(link.path, "src/main.rs");
        assert!(link.line_range.is_none());
    }

    #[test]
    fn parse_permalink_with_single_line() {
        let link = parse_single("https://github.com/owner/repo/blob/abcd1234/src/lib.rs#L42");
        assert_range(&link, 42, 42);
    }

    #[test]
    fn parse_permalink_with_line_range() {
        let link = parse_single("https://github.com/owner/repo/blob/abcd1234/src/lib.rs#L10-L20");
        assert_range(&link, 10, 20);
    }

    #[test]
    fn parse_branch_name() {
        let link = parse_single("https://github.com/owner/repo/blob/main/src/lib.rs");
        assert_eq!(link.git_ref, "main");
        assert_eq!(link.path, "src/lib.rs");
    }

    #[test]
    fn parse_branch_name_with_line_range() {
        let link = parse_single("https://github.com/owner/repo/blob/develop/src/main.rs#L5-L10");
        assert_eq!(link.git_ref, "develop");
        assert_range(&link, 5, 10);
    }

    #[test]
    fn parse_branch_name_with_single_line() {
        let link = parse_single("https://github.com/owner/repo/blob/main/src/lib.rs#L5");
        assert_eq!(link.git_ref, "main");
        assert_range(&link, 5, 5);
    }

    #[test]
    fn parse_branch_with_special_characters() {
        let cases = [
            (
                "https://github.com/o/r/blob/release-v1.0/f.rs",
                "release-v1.0",
            ),
            (
                "https://github.com/o/r/blob/feat_something/f.rs",
                "feat_something",
            ),
            ("https://github.com/o/r/blob/v2.0.0/f.rs", "v2.0.0"),
        ];
        for (text, expected_ref) in cases {
            assert_eq!(parse_single(text).git_ref, expected_ref);
        }
    }

    #[test]
    fn parse_tag_name() {
        let link = parse_single("https://github.com/owner/repo/blob/v1.0.0/src/main.rs#L1-L10");
        assert_eq!(link.git_ref, "v1.0.0");
        assert_range(&link, 1, 10);
    }

    #[test]
    fn parse_mixed_sha_and_branch() {
        let text = "https://github.com/o/r/blob/abcd1234/a.rs \
                    https://github.com/o/r/blob/main/b.rs";
        let links = GitHubPermalink::parse_all(text);
        let refs: Vec<&str> = links.iter().map(|link| link.git_ref.as_str()).collect();
        assert_eq!(refs, ["abcd1234", "main"]);
    }

    #[test]
    fn parse_accepts_short_ref() {
        // Short refs (e.g., short branch names) should still match
        let link = parse_single("https://github.com/owner/repo/blob/abc/src/lib.rs");
        assert_eq!(link.git_ref, "abc");
    }

    #[test]
    fn parse_deduplicates_urls() {
        let text = "https://github.com/owner/repo/blob/abcd1234/src/lib.rs \
                    https://github.com/owner/repo/blob/abcd1234/src/lib.rs";
        assert_eq!(GitHubPermalink::parse_all(text).len(), 1);
    }

    #[test]
    fn parse_limits_to_three() {
        let text = "\
            https://github.com/o/r/blob/aaaa1111/a.rs \
            https://github.com/o/r/blob/bbbb2222/b.rs \
            https://github.com/o/r/blob/cccc3333/c.rs \
            https://github.com/o/r/blob/dddd4444/d.rs";
        assert_eq!(GitHubPermalink::parse_all(text).len(), 3);
    }

    #[test]
    fn parse_multiple_different_urls() {
        let text = "Check https://github.com/a/b/blob/1111aaaa/x.rs#L1 and \
                    https://github.com/c/d/blob/2222bbbb/y.py#L5-L10";
        let links = GitHubPermalink::parse_all(text);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].owner, "a");
        assert_eq!(links[1].owner, "c");
        assert_eq!(links[1].path, "y.py");
    }

    #[test]
    fn parse_no_match() {
        assert!(GitHubPermalink::parse_all("Hello, no links here!").is_empty());
    }

    #[test]
    fn parse_ignores_angle_bracket_link() {
        let text = "<https://github.com/owner/repo/blob/abcd1234/src/lib.rs#L10-L20>";
        assert!(GitHubPermalink::parse_all(text).is_empty());
    }

    #[test]
    fn parse_nested_path() {
        let link =
            parse_single("https://github.com/owner/repo/blob/abcd1234/src/deeply/nested/path/file.rs");
        assert_eq!(link.path, "src/deeply/nested/path/file.rs");
    }

    #[test]
    fn parse_short_commit_sha() {
        // 4-character SHA is the minimum
        let link = parse_single("https://github.com/owner/repo/blob/abcd/file.rs");
        assert_eq!(link.git_ref, "abcd");
    }

    // --- language_for_path ---

    #[test]
    fn language_for_path_basic_extension() {
        assert_eq!(language_for_path("src/main.rs"), "rust");
    }

    #[test]
    fn language_for_path_dockerfile_in_subdir() {
        assert_eq!(language_for_path("docker/Dockerfile"), "dockerfile");
    }

    #[test]
    fn language_for_path_dotted_directory() {
        assert_eq!(language_for_path("some.config/Dockerfile"), "dockerfile");
    }

    #[test]
    fn language_for_path_makefile_in_subdir() {
        assert_eq!(language_for_path("build/Makefile"), "makefile");
    }

    #[test]
    fn language_for_path_multiple_dots() {
        assert_eq!(language_for_path("file.test.ts"), "typescript");
    }

    #[test]
    fn language_for_path_dotfile() {
        assert_eq!(language_for_path(".gitignore"), "gitignore");
    }

    // --- build_code_block ---

    #[test]
    fn build_code_block_full_file() {
        let body = "fn main() {\n    println!(\"hello\");\n}";
        let rendered = make_permalink("src/main.rs", None).build_code_block(body, 50);

        let (language, code, metadata) = code_block_parts(rendered);
        assert_eq!(language, "rust");
        assert_eq!(code, body);
        assert_eq!(metadata, "`src/main.rs` - owner/repo@abcdef1");
    }

    #[test]
    fn build_code_block_with_line_range() {
        let link = make_permalink("src/lib.rs", Some(LineRange { start: 2, end: 3 }));
        let rendered = link.build_code_block("line1\nline2\nline3\nline4", 50);

        let (language, code, metadata) = code_block_parts(rendered);
        assert_eq!(language, "rust");
        assert_eq!(code, "line2\nline3");
        assert!(metadata.contains("L2-L3"));
    }

    #[test]
    fn build_code_block_truncated() {
        let rendered = make_permalink("app.py", None).build_code_block("a\nb\nc\nd\ne", 2);

        let (_, code, metadata) = code_block_parts(rendered);
        assert_eq!(code, "a\nb");
        assert!(metadata.contains("truncated to 2 lines"));
    }

    #[test]
    fn build_code_block_line_range_truncated() {
        let link = make_permalink("app.py", Some(LineRange { start: 1, end: 5 }));
        let rendered = link.build_code_block("a\nb\nc\nd\ne", 3);

        let (_, code, metadata) = code_block_parts(rendered);
        assert_eq!(code, "a\nb\nc");
        assert!(metadata.contains("L1-L5"));
        assert!(metadata.contains("truncated to 3 lines"));
    }

    #[test]
    fn build_code_block_dockerfile_language() {
        let rendered =
            make_permalink("docker/Dockerfile", None).build_code_block("FROM rust:latest", 50);

        let (language, _, _) = code_block_parts(rendered);
        assert_eq!(language, "dockerfile");
    }

    #[test]
    fn build_code_block_short_commit() {
        let rendered = permalink("o", "r", "abcd", "f.rs", None).build_code_block("x", 50);

        let (_, _, metadata) = code_block_parts(rendered);
        assert!(metadata.contains("o/r@abcd"));
    }

    #[test]
    fn build_code_block_branch_ref() {
        let rendered = permalink("o", "r", "main", "f.rs", None).build_code_block("x", 50);

        let (_, _, metadata) = code_block_parts(rendered);
        // Branch names should not be truncated
        assert!(metadata.contains("o/r@main"));
    }

    #[test]
    fn build_code_block_branch_ref_with_line_range() {
        let link = permalink(
            "o",
            "r",
            "develop",
            "src/lib.rs",
            Some(LineRange { start: 3, end: 5 }),
        );
        let rendered = link.build_code_block("a\nb\nc\nd\ne\nf", 50);

        let (_, code, metadata) = code_block_parts(rendered);
        assert_eq!(code, "c\nd\ne");
        assert!(metadata.contains("L3-L5"));
        assert!(metadata.contains("o/r@develop"));
    }

    // --- is_commit_sha / shorten_ref ---

    #[test]
    fn is_commit_sha_valid() {
        assert!(is_commit_sha("abcd1234"));
        assert!(is_commit_sha("abcdef1234567890abcdef1234567890abcdef12"));
        // Uppercase hex is still valid hex
        assert!(is_commit_sha("ABCD1234"));
    }

    #[test]
    fn is_commit_sha_boundary() {
        // Exactly 4 hex chars (minimum)
        assert!(is_commit_sha("abcd"));
        // Exactly 40 hex chars (full SHA-1)
        assert!(is_commit_sha("abcdef1234567890abcdef1234567890abcdef12"));
    }

    #[test]
    fn is_commit_sha_invalid() {
        let rejected = [
            "main",
            "develop",
            "abc",                                       // too short
            "abcdef1234567890abcdef1234567890abcdef123", // too long (41)
            "ghijkl",                                    // non-hex
            "",                                          // empty
        ];
        for candidate in rejected {
            assert!(!is_commit_sha(candidate), "accepted: {candidate:?}");
        }
    }

    #[test]
    fn shorten_ref_commit() {
        assert_eq!(shorten_ref("abcdef1234567890"), "abcdef1");
    }

    #[test]
    fn shorten_ref_short_sha() {
        // 4-char SHA should not be truncated further
        assert_eq!(shorten_ref("abcd"), "abcd");
    }

    #[test]
    fn shorten_ref_branch() {
        for name in ["main", "feature-branch", "release-v1.0", "v2.0.0"] {
            assert_eq!(shorten_ref(name), name);
        }
    }
}