More than 5 years have passed since last update.

青空文庫からダウンロード，解凍，ルビ取り，UTF8化

Last updated at 2016-10-31Posted at 2012-12-26

修正したバージョンに差し替え

青空文庫で公開されている圧縮zipファイルのダウンロードリンク(URL)を文字列として引数に指定するとダウンロードして解凍、ルビなどのメタ情報を削除します。解凍したファイルは実行フォルダに保存され、ルビ処理後のファイルは実行フォルダ内に新規に作成されたNORUBYフォルダに保存されます。関数自体の返り値は、NORUBY ファルダと保存された加工ファイルへの相対パスです。

なお、MacやLinuxで実行した場合、処理の途中で文字コードをUTF-8に変換します。第二引数に処理後のファイル名を指定することもできます。

# ' see http://rmecab.jp/wiki/index.php?RMeCabFunctions#b1945a7c
# ' @param url   txtname
# ' 
Aozora <- function(url = NULL, txtname  = NULL){
    enc <-  switch(.Platform$pkgType, "win.binary" = "CP932", "UTF-8") 
  if (is.null(url)) stop ("specify URL")
  tmp <- unlist (strsplit (url, "/"))
  tmp <- tmp [length (tmp)]
 
  curDir <- getwd()
  tmp <- paste(curDir, tmp, sep = "/")
  download.file (url, tmp)

  textF <- unzip (tmp)
  unlink (tmp)
  
  if(!file.exists (textF)) stop ("something wrong!")
  if (is.null(txtname)) txtname <- paste(unlist(strsplit(basename (textF), ".txt$")))
   if (txtname != "NORUBY")  {

    newDir <- paste(dirname (textF), "NORUBY", sep = "/")

    if (! file.exists (newDir)) dir.create (newDir)

    newFile <- paste (newDir,  "/", txtname, "2.txt", sep = "")

    con <- file(textF, 'r', encoding = "CP932" )
    outfile <- file(newFile, 'w', encoding = enc)
    flag <- 0;
    reg1 <- enc2native ("\U005E\U5E95\U672C")
    reg2 <- enc2native ("\U3010\U5165\U529B\U8005\U6CE8\U3011")
    reg3 <- enc2native ("\UFF3B\UFF03\U005B\U005E\UFF3D\U005D\U002A\UFF3D")
    reg4 <- enc2native ("\U300A\U005B\U005E\U300B\U005D\U002A\U300B")
    reg5 <- enc2native ("\UFF5C")
    while (length(input <- readLines(con, n=1, encoding = "CP932")) > 0){
      if (grepl(reg1, input)) break ;
      if (grepl(reg2, input)) break;
      if (grepl("^------", input)) {
        flag <- !flag
      next;
      }
      if (!flag){
        input <- gsub (reg3, "", input, perl = TRUE)
        input <- gsub (reg4, "", input, perl = TRUE)
        input <- gsub (reg5, "", input, perl = TRUE)
        writeLines(input, con=outfile)
      }
    }
    close(con); close(outfile)
    return (newFile);
  }
}

以下、旧記事

~~朝一の作業で必要になって泥縄で作成したが，これで間に合うだろうか~~

http://rpubs.com/ishida/lesson1

# USAGE:
## source ("Aozora.R")
## 1. arg URL,  2. arg outputfile name
## Aozora ("http://www.aozora.gr.jp/cards/000129/files/673_ruby_23254.zip", "Out") #　カレントディレクトリに ./NORUBY/0ut.txtを生成

Aozora <- function(url = NULL, name  = NULL){
  if (is.null(url)) stop ("specify URL")
  tmp <- unlist (strsplit (url, "/"))
  tmp <- tmp [length (tmp)]
  curDir <- getwd()
  tmp <- paste(curDir, tmp, sep = "/")
  download.file (url, tmp)
  tmp <- unzip (tmp)
  if(!file.exists (tmp)) stop ("something wrong!")
  if (is.null(name)) name <- paste(unlist(strsplit(basename (tmp), ".txt$")))
   if (name != "NORUBY")  {
    newDir <- paste(dirname (tmp), "NORUBY", sep = "/")
    if (! file.exists (newDir)) dir.create (newDir)
    newFile <- paste (newDir,  "/", name, "2.txt", sep = "")
    con <- file(tmp, 'r', encoding = "CP932" )
    outfile <- file(newFile, 'w', encoding = "utf8")
    flag <- 0;
    while (length(input <- readLines(con, n=1, encoding = "CP932")) > 0){
      if (grepl("^底本", input)) break ;
      if (grepl("【入力者注】", input)) break;
      if (grepl("^------", input)) {
        flag <- !flag
      next;
      }
      if (!flag){
        input <- gsub ("［＃[^］]*］", "", input, perl = TRUE)
        input <- gsub ("《[^》]*》", "", input, perl = TRUE)
        input <- gsub ("｜", "", input, perl = TRUE)
        writeLines(input, con=outfile)
      }
    }
    close(con); close(outfile)
    return (newFile);
  }
}

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up