
Commit f6ddde6

Miscellaneous (#149)

* Common sampling distributions
* Replace the aml dataset with the lung dataset
* Use the spacyr package to tag parts of speech and lemmatize according to POS
* Move where the Python environment is configured
* Code formatting and polishing
1 parent 16c0ead commit f6ddde6

File tree

8 files changed: +326 −50 lines


.github/workflows/quarto-book-macos.yaml

Lines changed: 16 additions & 0 deletions
```diff
@@ -37,6 +37,7 @@ jobs:
       with:
         pandoc-version: ${{ env.PANDOC_VERSION }}
 
+
       - name: Install TinyTeX
         uses: r-lib/actions/setup-tinytex@v2
         env:
@@ -51,6 +52,17 @@ jobs:
           brew install --cask font-noto-sans-cjk-sc font-noto-serif-cjk-sc
           fc-list | sort
 
+      - name: Setup Python
+        run: |
+          sudo mkdir -p /opt/.virtualenvs/r-tensorflow
+          sudo chown -R $(whoami):staff /opt/.virtualenvs/r-tensorflow
+          virtualenv -p /usr/bin/python3 $RETICULATE_PYTHON_ENV
+          source $RETICULATE_PYTHON_ENV/bin/activate
+          pip3 install -r docker/requirements.txt
+          python -m spacy download en_core_web_sm
+          deactivate
+
       - name: Install LaTeX packages
         run: |
           if(!require('tinytex')) install.packages('tinytex')
@@ -62,9 +74,13 @@ jobs:
 
       - name: Render Book
         run: |
+          source $RETICULATE_PYTHON_ENV/bin/activate
           quarto check
           quarto render
         shell: bash
+        env:
+          RETICULATE_PYTHON_ENV: /opt/.virtualenvs/r-tensorflow
+          RETICULATE_PYTHON: /opt/.virtualenvs/r-tensorflow/bin/python
 
       - name: Deploy book to bookdown.org
         if: github.event_name == 'pull_request'
```
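The Setup Python step above can be rehearsed locally before pushing. A minimal sketch, assuming the stdlib `venv` module is an acceptable stand-in for `virtualenv`, and with the repo-specific installs (`docker/requirements.txt`, the spaCy model) commented out so it runs anywhere:

```shell
# Local dry-run of the workflow's "Setup Python" step (sketch only).
ENV_DIR="${RETICULATE_PYTHON_ENV:-$HOME/.virtualenvs/r-tensorflow}"
mkdir -p "$(dirname "$ENV_DIR")"
python3 -m venv "$ENV_DIR"              # workflow uses virtualenv instead
. "$ENV_DIR/bin/activate"
python -c 'import sys; print(sys.prefix)'  # prints the venv path
# pip install -r docker/requirements.txt
# python -m spacy download en_core_web_sm
deactivate
```

reticulate then picks this interpreter up through the `RETICULATE_PYTHON` environment variable that the Render Book step exports.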

.github/workflows/quarto-book-ubuntu.yaml

Lines changed: 14 additions & 0 deletions
```diff
@@ -38,6 +38,16 @@ jobs:
       with:
         pandoc-version: ${{ env.PANDOC_VERSION }}
 
+      - name: Setup Python
+        run: |
+          pip3 install virtualenv
+          mkdir -p /opt/.virtualenvs/r-tensorflow
+          virtualenv -p /usr/bin/python3 $RETICULATE_PYTHON_ENV
+          source $RETICULATE_PYTHON_ENV/bin/activate
+          pip3 install -r docker/requirements.txt
+          python -m spacy download en_core_web_sm
+          deactivate
+
       - name: Install TinyTeX
         uses: r-lib/actions/setup-tinytex@v2
         env:
@@ -68,9 +78,13 @@ jobs:
 
       - name: Render Book
         run: |
+          source $RETICULATE_PYTHON_ENV/bin/activate
           quarto check
           quarto render --to html
         shell: bash
+        env:
+          RETICULATE_PYTHON_ENV: /opt/.virtualenvs/r-tensorflow
+          RETICULATE_PYTHON: /opt/.virtualenvs/r-tensorflow/bin/python
 
       - name: Deploy to Github Pages
         uses: JamesIves/github-pages-deploy-action@v4
```

DESCRIPTION

Lines changed: 1 addition & 0 deletions
```diff
@@ -78,6 +78,7 @@ Imports:
     RSQLite,
     scatterplot3d,
     scs,
+    SemNetCleaner,
    sf,
    showtext,
    shiny,
```

_common.R

Lines changed: 4 additions & 0 deletions
```diff
@@ -94,3 +94,7 @@ to_png <- function(fig_path) {
   )
   return(png_path)
 }
+
+# Set up Python
+Sys.setenv(RETICULATE_PYTHON = "/opt/.virtualenvs/r-tensorflow/bin/python")
+Sys.setenv(RETICULATE_PYTHON_ENV = "/opt/.virtualenvs/r-tensorflow")
```

_quarto.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -46,6 +46,7 @@ book:
       - documents-office.qmd
     - part: "Statistical Analysis"
       chapters:
+        - sampling-distributions.qmd
        - common-statistical-tests.qmd
        - regression-and-correlation.qmd
        - categorical-data-analysis.qmd
```

analyze-survival-data.qmd

Lines changed: 40 additions & 45 deletions
````diff
@@ -11,15 +11,17 @@
 
 library(survival) # survfit
 library(ggplot2)
-library(ggfortify) # autoplot
 library(glmnet) # Cox Models
-library(VGAM) # R >= 4.4.0
-library(INLA)
+library(ggsurvfit)
+# library(VGAM) # R >= 4.4.0
+# library(INLA)
 ```
 
-Survival analysis applies to user-churn analysis: registration, activation, and activity; next-day, 7-day, and 15-day retention; how long until a student who attends a trial class pays for regular lessons; how long after an outpatient visit a patient is admitted to hospital. Historically, survival analysis began with studying how many aircraft returned from their sorties. Go back to the original literature to understand the basic concepts, and the thinking and applications behind them
+Survival analysis applies to user-churn analysis: registration, activation, and activity; next-day, 7-day, and 15-day retention; how long until a student who attends a trial class pays for regular lessons; how long after an outpatient visit a patient is admitted to hospital. Historically, survival analysis began with studying how many aircraft could still return from their sorties. To learn survival analysis, go back to the original literature to understand the basic concepts, and the thinking and applications behind them.
+
+Open the chapter with a survival problem, present a dataset, explore and analyze the data first, then build and fit models, and interpret the results.
 
-Open the chapter with a question, present a dataset, build a model, fit it, and interpret the results.
+The lung dataset, with the **survival** package (modeling) and the **ggsurvfit** package (visualization).
 
 ## Problem Background {#sec-aml}
 
@@ -28,7 +30,7 @@ library(INLA)
 ```{r}
 library(survival)
 data(cancer, package = "survival")
-str(aml)
+str(lung)
 ```
 
 The distribution of the data is shown below.
@@ -40,7 +42,7 @@ str(aml)
 #| fig-width: 4.5
 #| fig-height: 3
 
-ggplot(data = aml, aes(x = time, y = status, color = x)) +
+ggplot(data = lung, aes(x = time, y = status, color = factor(sex))) +
   geom_jitter(height = 0.2) +
   theme_minimal()
 ```
@@ -55,46 +57,46 @@ Cox proportional hazards regression and the Box-Cox transformation [@Box1964]
 - `MASS::boxcox()` Box-Cox transformation
 - `glmnet::glmnet(family = "cox")`
 - The INLA functions `inla()` and `inla.surv()` fit survival models together, [link](https://becarioprecario.bitbucket.io/inla-gitbook/ch-survival.html)
-- [survstan](https://github.com/fndemarqui/survstan) Stan and survival analysis
-- The rstanarm function `stan_jm()`, documented in Estimating Joint Models for Longitudinal and Time-to-Event Data with rstanarm, [link](https://cran.r-project.org/web/packages/rstanarm/vignettes/jm.html)
-- The rstanarm [survival-analysis branch](https://github.com/stan-dev/rstanarm/pull/323)
 
 ### survival
 
 R ships with the [survival](https://github.com/therneau/survival) package, the core R package for survival analysis [@Terry2000]; its function `survfit()` fits models.
 
 ```{r}
-aml_survival <- survfit(Surv(time, status) ~ x, data = aml)
-summary(aml_survival)
+lung_surv <- survfit(Surv(time, status) ~ sex, data = lung)
+lung_surv
 ```
 
 Fit a Cox proportional hazards regression model.
 
 ```{r}
-aml_coxph <- coxph(Surv(time, status) ~ 1 + x, data = aml)
-summary(aml_coxph)
+lung_coxph <- coxph(Surv(time, status) ~ 1 + sex, data = lung)
+summary(lung_coxph)
 ```
 
-Display the fitted results. Many R packages draw survival plots, e.g. the ggfortify, [ggsurvfit](https://github.com/ddsjoberg/ggsurvfit/), and [survminer](https://github.com/kassambara/survminer) packages. ggfortify can plot the object returned by `survfit()` directly; [ggsurvfit](https://github.com/ddsjoberg/ggsurvfit/) provides a new function `survfit2()` to fit the model and `ggsurvfit()` to draw richer figures; [survminer](https://github.com/kassambara/survminer) has many dependencies.
+Display the fitted results. Many R packages draw survival plots, e.g. the ggfortify, [ggsurvfit](https://github.com/pharmaverse/ggsurvfit), and [survminer](https://github.com/kassambara/survminer) packages. ggfortify can plot the object returned by `survfit()` directly; ggsurvfit provides a new function `survfit2()` to fit the model and `ggsurvfit()` to draw richer figures; survminer has many dependencies.
 
 ```{r}
 #| label: fig-leukemia-surv
 #| fig-cap: "Acute myeloid leukemia survival data"
 #| fig-showtext: true
 #| fig-width: 6
-#| fig-height: 3
-
-library(ggplot2)
-library(ggfortify)
-autoplot(aml_survival, data = aml) +
-  theme_minimal()
-```
-
-Parametric survival models (parametric, as opposed to nonparametric)
-
-```{r}
-aml_surv_reg <- survreg(Surv(time, status) ~ x, data = aml, dist = "weibull")
-summary(aml_surv_reg)
+#| fig-height: 5
+
+p <- survfit2(Surv(time, status) ~ sex, data = lung) |>
+  ggsurvfit(linewidth = 1) +
+  add_confidence_interval() +
+  add_risktable() +
+  add_quantile(y_value = 0.6, color = "gray50", linewidth = 0.75) +
+  scale_ggsurvfit()
+p +
+  # limit plot to show 8 years and less
+  coord_cartesian(xlim = c(0, 1000)) +
+  # update figure labels/titles
+  labs(
+    y = "Percentage Survival",
+    title = "Recurrence by Time From Surgery to Randomization",
+  )
 ```
 
 ### glmnet
@@ -106,19 +108,22 @@ The glmnet package fits the Cox proportional hazards model [@simon2011], suited to settings with multiple variables
 
 library(glmnet)
 # alpha = 1 lasso
-aml_glmnet <- glmnet(x = aml$x, y = Surv(aml$time, aml$status), family = "cox", alpha = 1)
-aml_glmnet_cv <- cv.glmnet(x = aml$x, y = Surv(aml$time, aml$status), family = "cox", alpha = 1)
+lung_glmnet <- glmnet(x = lung$sex, y = Surv(lung$time, lung$status), family = "cox", alpha = 1)
+lung_glmnet_cv <- cv.glmnet(x = lung$sex, y = Surv(lung$time, lung$status), family = "cox", alpha = 1)
 ```
 
 ### INLA
 
 The INLA package fits the Cox proportional hazards model [@Virgilio2020] using approximate Bayesian inference.
 
 ```{r}
+#| eval: false
+
 library(INLA)
 inla.setOption(short.summary = TRUE)
-aml_inla <- inla(inla.surv(time, status) ~ x, data = aml, family = "exponential.surv", num.threads = "1:1")
-summary(aml_inla)
+lung_inla <- inla(inla.surv(time, status) ~ sex, data = lung,
+  family = "exponential.surv", num.threads = "1:1")
+summary(lung_inla)
 ```
 
 ## Tobit Regression {#sec-tobit-regression}
@@ -135,17 +140,7 @@ Tobit (Tobin's Probit) regression originates from the Tobit model in econometrics,
 
 library(VGAM) # Vector Generalized Linear and Additive Models
 # VGAM::vglm(family = tobit(Upper = 800)) # Tobit regression
-```
-
-```{r}
-library(VGAM)
-with(aml, SurvS4(time, status))
-```
-
-```{r}
-#| eval: false
-#| echo: false
-
-aml_vglm <- vglm(SurvS4(time, status) ~ x, data = aml, family = cens.poisson)
-summary(aml_vglm)
+with(lung, SurvS4(time, status))
+lung_vglm <- vglm(SurvS4(time, status) ~ sex, data = lung, family = cens.poisson)
+summary(lung_vglm)
 ```
````
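The `survfit()` call in this file computes Kaplan-Meier survival estimates for each `sex` stratum. As a first-principles sketch of the product-limit estimator it implements (illustrative data, not the lung dataset; this mirrors the math, not the survival package's internals):

```python
# Kaplan-Meier product-limit estimator, sketched from first principles.
# event = 1 is an observed death, 0 is right-censoring. At each distinct
# event time t: S(t) <- S(t-) * (1 - d_t / n_t), where d_t is the number
# of deaths at t and n_t the number still at risk just before t.

def kaplan_meier(times, events):
    """Return [(t, S(t))] at each distinct event time."""
    data = sorted(zip(times, events))
    n_at_risk = len(data)
    surv = 1.0
    curve = []
    i = 0
    while i < len(data):
        t = data[i][0]
        deaths = sum(1 for tt, e in data if tt == t and e == 1)
        n_tied = sum(1 for tt, e in data if tt == t)
        if deaths > 0:
            surv *= 1 - deaths / n_at_risk  # product-limit step
            curve.append((t, surv))
        n_at_risk -= n_tied  # both deaths and censorings leave the risk set
        i += n_tied
    return curve

# Ten made-up subjects: follow-up time and event indicator.
times = [5, 8, 8, 12, 16, 23, 27, 30, 33, 43]
events = [1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
for t, s in kaplan_meier(times, events):
    print(t, round(s, 3))
```

Censored subjects drop out of the risk set without stepping the curve down, which is exactly why the Kaplan-Meier estimate differs from a naive empirical survival fraction.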

analyze-text-data.qmd

Lines changed: 51 additions & 5 deletions
````diff
@@ -17,6 +17,8 @@ library(quanteda.textstats) # queries, statistics
 library(quanteda.textmodels) # LSA
 library(ggplot2) # plotting
 library(text2vec) # LDA algorithm
+library(spacyr)
+library(data.table)
 ```
 
 Next, call the `tools` function `CRAN_package_db()` to fetch R package metadata, and save it locally for convenient reuse.
@@ -45,15 +47,57 @@ pdb$Title <- gsub(pattern = '"', replacement = "", x = pdb$Title, fixed = T)
 pdb$Title <- tolower(pdb$Title)
 ```
 
-- Stemming and lemmatization. This step is tricky, requiring different rules for different parts of speech. To singularize nouns, call `singularize()` from the **SemNetCleaner** package, e.g. models / modeling reduce to model, methods to method, and so on.
+- Stemming and lemmatization. This step is tricky: first parse out the parts of speech with the **spacyr** package, then apply different rules by part of speech. To singularize nouns, call `singularize()` from the **SemNetCleaner** package, e.g. models / modeling reduce to model, methods to method, and so on.
 
 ```{r}
+#| message: false
+
+# Vectorize singularize for use inside fcase
+vec_singularize <- function(word, ...){
+  unlist(lapply(word, SemNetCleaner::singularize, ...))
+}
+vec_singularize(word = c("methods", "models", "data"))
+library(spacyr)
+# OpenMP
+Sys.setenv(KMP_DUPLICATE_LIB_OK = TRUE)
+# Initialize; named-entity recognition is not needed
+spacy_initialize(model = "en_core_web_sm", entity = F)
+
+# Prepare the text vector to parse
+title_desc <- pdb$Title
+names(title_desc) <- pdb$Package
+# Parsing the text takes about a minute
+title_token <- spacy_parse(x = title_desc, entity = F)
+
+# Use data.table to speed up the data manipulation
+library(data.table)
+title_token <- as.data.table(title_token)
+# Create a new column to hold the lemma
+title_token$lemma2 <- title_token$lemma
+# Handle verbs and nouns
+title_token$lemma2 <- fcase(
+  title_token$pos %in% c("VERB", "AUX"), title_token$lemma,
+  title_token$pos %in% c("NOUN", "PROPN", "PRON"), vec_singularize(title_token$token),
+  !title_token$pos %in% c("VERB", "AUX", "NOUN", "PROPN", "PRON"), title_token$token
+)
+# Collapse back into one string per document
+pdb <- aggregate(title_token, lemma2 ~ doc_id, paste, collapse = " ")
+colnames(pdb) <- c("Package", "Title")
+# Clean up intermediate variables
+rm(title_token, title_desc)
+```
+
+```{r}
+#| eval: false
+#| code-fold: true
+
 # Operations after tokenization
-# Install the tidytext package
 # Nouns
-SnowballC::wordStem(words = c("methods", "models"))
+SnowballC::wordStem(words = c("methods", "models", "data"))
 # pdb$Title_stem <- SnowballC::wordStem(pdb$Title)
-tokenizers::tokenize_word_stems(x = c("methods", "models"))
+tokenizers::tokenize_word_stems(x = c("methods", "models", "data"))
+# Stem with the SnowballC package
+quanteda::tokens_wordstem(tokens(x = c("methods models data")))
 ```
 
 Length distribution of R package titles
@@ -119,7 +163,7 @@ R, as a language mainly used for data acquisition, analysis, processing, modeling, and visualization
 #| code-fold: true
 # Word cloud
 set.seed(20252025)
-textplot_wordcloud(word1, min_size = 1, max_size = 5)
+textplot_wordcloud(word1, min_size = 0.9, max_size = 5)
 ```
 
 ## Collocations and Phrases {#sec-multi-word-expressions}
@@ -155,6 +199,8 @@
   (\(x) x[order(x$count, decreasing = T), ])()
 ```
 
+Two of the phrases, `via windsor.ai api` and `amazon web service`, look odd at first glance; in fact they come from the series of R packages released by these two companies.
+
 ```{r}
 #| label: fig-frequency-phrase
 #| fig-cap: High-frequency phrases
````
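The POS-conditional lemmatization rule added in this file (verbs keep the tagger's lemma, nouns get singularized, everything else passes through) can be sketched without R. The singularizer below is a toy stand-in for `SemNetCleaner::singularize()` covering only regular English plural patterns, and the hand-tagged tuples stand in for `spacy_parse()` output; both are illustrative assumptions, not the book's actual pipeline:

```python
# Toy POS-conditional lemmatizer mirroring the fcase() logic in the diff.

def singularize(word):
    """Rule-based singularization; a toy stand-in, regular plurals only."""
    irregular = {"data": "data", "analyses": "analysis", "indices": "index"}
    if word in irregular:
        return irregular[word]
    if word.endswith("ies") and len(word) > 3:
        return word[:-3] + "y"          # libraries -> library
    if word.endswith(("ses", "xes", "ches", "shes")):
        return word[:-2]                # classes -> class
    if word.endswith("s") and not word.endswith("ss"):
        return word[:-1]                # methods -> method
    return word

def normalize(token, lemma, pos):
    """Verbs keep the lemma, nouns are singularized, the rest pass through."""
    if pos in ("VERB", "AUX"):
        return lemma
    if pos in ("NOUN", "PROPN", "PRON"):
        return singularize(token)
    return token

# Hand-tagged (token, lemma, pos) tuples standing in for spacy_parse() output.
tagged = [("models", "model", "NOUN"), ("fitted", "fit", "VERB"),
          ("bayesian", "bayesian", "ADJ"), ("methods", "method", "NOUN")]
print(" ".join(normalize(t, l, p) for t, l, p in tagged))
# → model fit bayesian method
```

The point of branching on POS is visible in "fitted": a noun-style suffix stripper would leave it untouched, while the verb branch recovers the lemma "fit".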
