// babyrite/expand/github.rs

//! GitHub permalink expansion.
//!
//! This module provides functionality for parsing GitHub permalink URLs
//! and fetching raw file content to display as code blocks.
5
6use regex::Regex;
7use std::collections::HashSet;
8use std::sync::LazyLock;
9
10use super::{ExpandError, ExpandedContent};
11use crate::config::BabyriteConfig;
12use crate::utils::language_from_extension;
13
14/// Regex pattern for matching GitHub permalink URLs.
15///
16/// Captures: owner, repo, commit, path, and optional line range fragment.
17///
18/// Supported patterns:
19/// - `https://github.com/{owner}/{repo}/blob/{commit}/{path}`
20/// - `https://github.com/{owner}/{repo}/blob/{commit}/{path}#L{line}`
21/// - `https://github.com/{owner}/{repo}/blob/{commit}/{path}#L{start}-L{end}`
22static GITHUB_PERMALINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
23    Regex::new(
24        r"https://github\.com/([^/]+)/([^/]+)/blob/([0-9a-f]{4,40})/([^#\s]+)(?:#L(\d+)(?:-L(\d+))?)?"
25    )
26    .unwrap()
27});
28
29/// A parsed GitHub permalink.
30#[derive(Debug)]
31pub struct GitHubPermalink {
32    /// Repository owner.
33    pub owner: String,
34    /// Repository name.
35    pub repo: String,
36    /// Commit SHA.
37    pub commit: String,
38    /// File path within the repository.
39    pub path: String,
40    /// Optional line range specification.
41    pub line_range: Option<LineRange>,
42}
43
/// A line range extracted from a GitHub permalink fragment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LineRange {
    /// Start line (1-indexed).
    pub start: usize,
    /// End line (1-indexed, inclusive). Same as `start` for single-line references.
    pub end: usize,
}
52
53/// Errors that can occur when expanding a GitHub permalink.
54#[derive(thiserror::Error, Debug)]
55pub enum GitHubExpandError {
56    /// Failed to fetch the raw file content.
57    #[error("Failed to fetch raw content: {0}")]
58    Fetch(String),
59    /// The fetched content exceeds the maximum allowed size.
60    #[error("Content exceeds size limit")]
61    ContentTooLarge,
62    /// An HTTP error occurred.
63    #[error(transparent)]
64    Http(#[from] reqwest::Error),
65}
66
67impl GitHubPermalink {
68    /// Parses all GitHub permalink URLs from the given text.
69    ///
70    /// Only matches URLs with a commit SHA (not branch names), ensuring
71    /// only true permalinks are expanded.
72    ///
73    /// Note: Duplicate URLs are ignored, and a maximum of 3 links are returned.
74    pub fn parse_all(text: &str) -> Vec<GitHubPermalink> {
75        let mut seen_urls = HashSet::new();
76        GITHUB_PERMALINK_REGEX
77            .captures_iter(text)
78            .filter_map(|captures| {
79                let m = captures.get(0)?;
80                let full_url = m.as_str();
81                // Skip URLs wrapped in angle brackets (e.g., <https://...>)
82                if m.start() > 0 && text.as_bytes()[m.start() - 1] == b'<' {
83                    return None;
84                }
85                if !seen_urls.insert(full_url.to_string()) {
86                    return None;
87                }
88
89                let owner = captures.get(1)?.as_str().to_string();
90                let repo = captures.get(2)?.as_str().to_string();
91                let commit = captures.get(3)?.as_str().to_string();
92                let path = captures.get(4)?.as_str().to_string();
93
94                let line_range = match (captures.get(5), captures.get(6)) {
95                    (Some(start), Some(end)) => {
96                        let s = start.as_str().parse().ok()?;
97                        let e = end.as_str().parse().ok()?;
98                        Some(LineRange { start: s, end: e })
99                    }
100                    (Some(start), None) => {
101                        let s = start.as_str().parse().ok()?;
102                        Some(LineRange { start: s, end: s })
103                    }
104                    _ => None,
105                };
106
107                Some(GitHubPermalink {
108                    owner,
109                    repo,
110                    commit,
111                    path,
112                    line_range,
113                })
114            })
115            .take(3) // Limit to maximum 3 links
116            .collect()
117    }
118
119    /// Fetches the raw file content from GitHub and returns a code block.
120    pub async fn fetch(
121        &self,
122        http_client: &reqwest::Client,
123    ) -> Result<ExpandedContent, ExpandError> {
124        let config = BabyriteConfig::get();
125        let max_lines = config.github.max_lines;
126
127        let raw_url = format!(
128            "https://raw.githubusercontent.com/{}/{}/{}/{}",
129            self.owner, self.repo, self.commit, self.path
130        );
131
132        let response = http_client
133            .get(&raw_url)
134            .send()
135            .await
136            .map_err(GitHubExpandError::Http)?;
137
138        if !response.status().is_success() {
139            return Err(GitHubExpandError::Fetch(format!(
140                "HTTP {} for {}",
141                response.status(),
142                raw_url
143            ))
144            .into());
145        }
146
147        let content_length = response.content_length().unwrap_or(0);
148        // 1 MB limit to avoid fetching huge files
149        if content_length > 1_048_576 {
150            return Err(GitHubExpandError::ContentTooLarge.into());
151        }
152
153        let body = response.text().await.map_err(GitHubExpandError::Http)?;
154
155        Ok(self.build_code_block(&body, max_lines))
156    }
157
158    /// Builds an `ExpandedContent::CodeBlock` from raw file content.
159    fn build_code_block(&self, body: &str, max_lines: usize) -> ExpandedContent {
160        let all_lines: Vec<&str> = body.lines().collect();
161        let (code, line_info) = match self.line_range {
162            Some(range) => {
163                let start = range.start.saturating_sub(1); // 0-indexed
164                let end = range.end.min(all_lines.len());
165                let selected: Vec<&str> = all_lines.get(start..end).unwrap_or_default().to_vec();
166
167                let (code, truncated) = truncate_lines(&selected, max_lines);
168                let info = if truncated {
169                    format!(
170                        "L{}-L{}, truncated to {} lines",
171                        range.start, range.end, max_lines
172                    )
173                } else {
174                    format!("L{}-L{}", range.start, range.end)
175                };
176                (code, info)
177            }
178            None => {
179                let (code, truncated) = truncate_lines(&all_lines, max_lines);
180                let info = if truncated {
181                    format!("truncated to {} lines", max_lines)
182                } else {
183                    String::new()
184                };
185                (code, info)
186            }
187        };
188
189        let short_commit = &self.commit[..7.min(self.commit.len())];
190        let language = language_for_path(&self.path);
191
192        let metadata = if line_info.is_empty() {
193            format!(
194                "`{}` - {}/{}@{}",
195                self.path, self.owner, self.repo, short_commit
196            )
197        } else {
198            format!(
199                "`{}` ({}) - {}/{}@{}",
200                self.path, line_info, self.owner, self.repo, short_commit
201            )
202        };
203
204        ExpandedContent::CodeBlock {
205            language: language.to_string(),
206            code,
207            metadata,
208        }
209    }
210}
211
212/// Extracts the filename from a path and returns the appropriate language identifier.
213fn language_for_path(path: &str) -> &str {
214    let filename = path.rsplit('/').next().unwrap_or(path);
215    match filename.rsplit_once('.') {
216        Some((_, ext)) => language_from_extension(ext),
217        None => language_from_extension(filename),
218    }
219}
220
/// Truncates lines to the given maximum, returning the joined string and whether truncation occurred.
///
/// At most the first `max` entries of `lines` are kept and joined with `\n`.
/// The returned flag is `true` only when `lines` held more than `max` entries.
fn truncate_lines(lines: &[&str], max: usize) -> (String, bool) {
    let truncated = lines.len() > max;
    // Join straight from the borrowed slice; no intermediate copy is needed.
    let kept = if truncated { &lines[..max] } else { lines };
    (kept.join("\n"), truncated)
}
230
#[cfg(test)]
mod tests {
    //! Unit tests for permalink parsing, language detection, and code block rendering.

    use super::*;

    // --- truncate_lines ---

    #[test]
    fn truncate_lines_under_limit() {
        let lines = vec!["a", "b", "c"];
        let (result, truncated) = truncate_lines(&lines, 5);
        assert_eq!(result, "a\nb\nc");
        assert!(!truncated);
    }

    #[test]
    fn truncate_lines_at_limit() {
        let lines = vec!["a", "b", "c"];
        let (result, truncated) = truncate_lines(&lines, 3);
        assert_eq!(result, "a\nb\nc");
        assert!(!truncated);
    }

    #[test]
    fn truncate_lines_over_limit() {
        let lines = vec!["a", "b", "c", "d", "e"];
        let (result, truncated) = truncate_lines(&lines, 2);
        assert_eq!(result, "a\nb");
        assert!(truncated);
    }

    #[test]
    fn truncate_lines_empty() {
        let lines: Vec<&str> = vec![];
        let (result, truncated) = truncate_lines(&lines, 5);
        assert_eq!(result, "");
        assert!(!truncated);
    }

    // --- GitHubPermalink::parse_all ---

    #[test]
    fn parse_basic_permalink() {
        let text = "https://github.com/owner/repo/blob/abcdef1234567890abcdef1234567890abcdef12/src/main.rs";
        let results = GitHubPermalink::parse_all(text);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].owner, "owner");
        assert_eq!(results[0].repo, "repo");
        assert_eq!(
            results[0].commit,
            "abcdef1234567890abcdef1234567890abcdef12"
        );
        assert_eq!(results[0].path, "src/main.rs");
        assert!(results[0].line_range.is_none());
    }

    #[test]
    fn parse_permalink_with_single_line() {
        let text = "https://github.com/owner/repo/blob/abcd1234/src/lib.rs#L42";
        let results = GitHubPermalink::parse_all(text);
        assert_eq!(results.len(), 1);
        let range = results[0].line_range.unwrap();
        assert_eq!(range.start, 42);
        assert_eq!(range.end, 42);
    }

    #[test]
    fn parse_permalink_with_line_range() {
        let text = "https://github.com/owner/repo/blob/abcd1234/src/lib.rs#L10-L20";
        let results = GitHubPermalink::parse_all(text);
        assert_eq!(results.len(), 1);
        let range = results[0].line_range.unwrap();
        assert_eq!(range.start, 10);
        assert_eq!(range.end, 20);
    }

    #[test]
    fn parse_rejects_branch_name() {
        // Branch names (non-hex) should not match
        let text = "https://github.com/owner/repo/blob/main/src/lib.rs";
        let results = GitHubPermalink::parse_all(text);
        assert!(results.is_empty());
    }

    #[test]
    fn parse_rejects_short_sha() {
        // SHA must be at least 4 hex characters
        let text = "https://github.com/owner/repo/blob/abc/src/lib.rs";
        let results = GitHubPermalink::parse_all(text);
        assert!(results.is_empty());
    }

    #[test]
    fn parse_deduplicates_urls() {
        let text = "https://github.com/owner/repo/blob/abcd1234/src/lib.rs \
                    https://github.com/owner/repo/blob/abcd1234/src/lib.rs";
        let results = GitHubPermalink::parse_all(text);
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn parse_limits_to_three() {
        let text = "\
            https://github.com/o/r/blob/aaaa1111/a.rs \
            https://github.com/o/r/blob/bbbb2222/b.rs \
            https://github.com/o/r/blob/cccc3333/c.rs \
            https://github.com/o/r/blob/dddd4444/d.rs";
        let results = GitHubPermalink::parse_all(text);
        assert_eq!(results.len(), 3);
    }

    #[test]
    fn parse_multiple_different_urls() {
        let text = "Check https://github.com/a/b/blob/1111aaaa/x.rs#L1 and \
                    https://github.com/c/d/blob/2222bbbb/y.py#L5-L10";
        let results = GitHubPermalink::parse_all(text);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].owner, "a");
        assert_eq!(results[1].owner, "c");
        assert_eq!(results[1].path, "y.py");
    }

    #[test]
    fn parse_no_match() {
        let text = "Hello, no links here!";
        let results = GitHubPermalink::parse_all(text);
        assert!(results.is_empty());
    }

    #[test]
    fn parse_ignores_angle_bracket_link() {
        let text = "<https://github.com/owner/repo/blob/abcd1234/src/lib.rs#L10-L20>";
        let results = GitHubPermalink::parse_all(text);
        assert!(results.is_empty());
    }

    #[test]
    fn parse_nested_path() {
        let text = "https://github.com/owner/repo/blob/abcd1234/src/deeply/nested/path/file.rs";
        let results = GitHubPermalink::parse_all(text);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].path, "src/deeply/nested/path/file.rs");
    }

    #[test]
    fn parse_short_commit_sha() {
        // 4-character SHA is the minimum
        let text = "https://github.com/owner/repo/blob/abcd/file.rs";
        let results = GitHubPermalink::parse_all(text);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].commit, "abcd");
    }

    // --- language_for_path ---

    #[test]
    fn language_for_path_basic_extension() {
        assert_eq!(language_for_path("src/main.rs"), "rust");
    }

    #[test]
    fn language_for_path_dockerfile_in_subdir() {
        assert_eq!(language_for_path("docker/Dockerfile"), "dockerfile");
    }

    #[test]
    fn language_for_path_dotted_directory() {
        assert_eq!(language_for_path("some.config/Dockerfile"), "dockerfile");
    }

    #[test]
    fn language_for_path_makefile_in_subdir() {
        assert_eq!(language_for_path("build/Makefile"), "makefile");
    }

    #[test]
    fn language_for_path_multiple_dots() {
        assert_eq!(language_for_path("file.test.ts"), "typescript");
    }

    #[test]
    fn language_for_path_dotfile() {
        assert_eq!(language_for_path(".gitignore"), "gitignore");
    }

    // --- build_code_block ---

    /// Builds a permalink with fixed owner/repo/commit for the rendering tests.
    fn make_permalink(path: &str, line_range: Option<LineRange>) -> GitHubPermalink {
        GitHubPermalink {
            owner: "owner".to_string(),
            repo: "repo".to_string(),
            commit: "abcdef1234567".to_string(),
            path: path.to_string(),
            line_range,
        }
    }

    #[test]
    fn build_code_block_full_file() {
        let permalink = make_permalink("src/main.rs", None);
        let body = "fn main() {\n    println!(\"hello\");\n}";
        let result = permalink.build_code_block(body, 50);

        match result {
            ExpandedContent::CodeBlock {
                language,
                code,
                metadata,
            } => {
                assert_eq!(language, "rust");
                assert_eq!(code, body);
                assert_eq!(metadata, "`src/main.rs` - owner/repo@abcdef1");
            }
            _ => panic!("expected CodeBlock"),
        }
    }

    #[test]
    fn build_code_block_with_line_range() {
        let permalink = make_permalink("src/lib.rs", Some(LineRange { start: 2, end: 3 }));
        let body = "line1\nline2\nline3\nline4";
        let result = permalink.build_code_block(body, 50);

        match result {
            ExpandedContent::CodeBlock {
                language,
                code,
                metadata,
            } => {
                assert_eq!(language, "rust");
                assert_eq!(code, "line2\nline3");
                assert!(metadata.contains("L2-L3"));
            }
            _ => panic!("expected CodeBlock"),
        }
    }

    #[test]
    fn build_code_block_range_past_end_of_file() {
        // A range that starts beyond the last line selects nothing.
        let permalink = make_permalink("src/lib.rs", Some(LineRange { start: 10, end: 12 }));
        let body = "line1\nline2\nline3\nline4";
        let result = permalink.build_code_block(body, 50);

        match result {
            ExpandedContent::CodeBlock { code, metadata, .. } => {
                assert_eq!(code, "");
                assert!(metadata.contains("L10-L12"));
            }
            _ => panic!("expected CodeBlock"),
        }
    }

    #[test]
    fn build_code_block_truncated() {
        let permalink = make_permalink("app.py", None);
        let body = "a\nb\nc\nd\ne";
        let result = permalink.build_code_block(body, 2);

        match result {
            ExpandedContent::CodeBlock { code, metadata, .. } => {
                assert_eq!(code, "a\nb");
                assert!(metadata.contains("truncated to 2 lines"));
            }
            _ => panic!("expected CodeBlock"),
        }
    }

    #[test]
    fn build_code_block_line_range_truncated() {
        let permalink = make_permalink("app.py", Some(LineRange { start: 1, end: 5 }));
        let body = "a\nb\nc\nd\ne";
        let result = permalink.build_code_block(body, 3);

        match result {
            ExpandedContent::CodeBlock { code, metadata, .. } => {
                assert_eq!(code, "a\nb\nc");
                assert!(metadata.contains("L1-L5"));
                assert!(metadata.contains("truncated to 3 lines"));
            }
            _ => panic!("expected CodeBlock"),
        }
    }

    #[test]
    fn build_code_block_dockerfile_language() {
        let permalink = make_permalink("docker/Dockerfile", None);
        let body = "FROM rust:latest";
        let result = permalink.build_code_block(body, 50);

        match result {
            ExpandedContent::CodeBlock { language, .. } => {
                assert_eq!(language, "dockerfile");
            }
            _ => panic!("expected CodeBlock"),
        }
    }

    #[test]
    fn build_code_block_short_commit() {
        let permalink = GitHubPermalink {
            owner: "o".to_string(),
            repo: "r".to_string(),
            commit: "abcd".to_string(),
            path: "f.rs".to_string(),
            line_range: None,
        };
        let result = permalink.build_code_block("x", 50);

        match result {
            ExpandedContent::CodeBlock { metadata, .. } => {
                assert!(metadata.contains("o/r@abcd"));
            }
            _ => panic!("expected CodeBlock"),
        }
    }
}