Skip to content

Commit

Permalink
20200402 添加datacamp解析功能
Browse files Browse the repository at this point in the history
  • Loading branch information
univerone committed Apr 2, 2020
1 parent 92f1e6e commit 4442059
Show file tree
Hide file tree
Showing 9 changed files with 203 additions and 29 deletions.
55 changes: 33 additions & 22 deletions .idea/workspace.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Icourse-Downloader
icourse-downloader可以根据课程链接下载[爱课程网](https://www.icourses.cn/home/), [华文慕课](http://www.chinesemooc.org)以及[中国大学MOOC](https://www.icourse163.org) 上的视频以及课件文档等
icourse-downloader可以根据课程链接下载[爱课程网](https://www.icourses.cn/home/)、[华文慕课](http://www.chinesemooc.org)、[中国大学MOOC](https://www.icourse163.org)以及
[datacamp](https://www.datacamp.com/)上的视频以及课件文档等

![](https://img.shields.io/github/repo-size/webscrapingproject/icourse-downloader) ![](https://img.shields.io/github/v/release/webscrapingproject/icourse-downloader)

Expand All @@ -18,6 +19,10 @@ icourse-downloader可以根据课程链接下载[爱课程网](https://www.icour
```bash
./icourse https://www.icourse163.org/learn/SDU-1001907001?tid=1003113029
```
获取datacamp的课程资料下载链接
```bash
./icourse https://learn.datacamp.com/courses/data-visualization-with-ggplot2-part-3
```
## 2. 参数说明
### 2.1 爱课程下载参数
```
Expand Down Expand Up @@ -58,4 +63,15 @@ all 下载全部内容
video 仅下载课件视频
videoPPT 仅下载课程课件
RichText 仅下载课程富文本附件
```
```
### 2.4 datacamp下载参数
```
icourse -o <outputPath> <url-of-datacamp>
```
默认获得视频文件、字幕文件以及课件文件的下载地址保存在文件中,并生成重命名的批处理文件

## 相关博文

1. https://blog.univerone.com/post/34-datacamp-video-download/

2. https://blog.univerone.com/post/23-go-icourse-downloader/
2 changes: 1 addition & 1 deletion config/version.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package config

// 版本号
const VERSION = "0.0.3"
const VERSION = "1.0.0"
Binary file removed icourse
Binary file not shown.
Binary file removed icourse-darwin
Binary file not shown.
Binary file removed icourse-win
Binary file not shown.
11 changes: 7 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
package main

import (
"flag"
"fmt"
"icourse/config"
"icourse/parser"
"icourse/utils"
"flag"
"fmt"
"os"
"strings"
)
Expand All @@ -16,7 +16,7 @@ import (
func init() {
flag.BoolVar(&config.Version, "v", false, "Show version")
//all为全部下载,most为视频课件以及试卷,也为下载默认选项,videoPPT仅下载视频和课件,exams为仅下载试卷,resources仅下载其它资源
flag.StringVar(&config.ContentOptions, "co", "all", "Only for icourse : Specify the download content {all,most,videoPPT,assignments,testPaper,shareResource}\nOnly for chinesemooc : Specify the download content {all, video , PPT}\nOnly for Icourse163 : Specify the download content {all, video , PPT , RichText}")
flag.StringVar(&config.ContentOptions, "co", "all", "Only for icourse : Specify the download content {all,most,videoPPT,assignments,testPaper,shareResource}\nOnly for chinesemooc : Specify the download content {all, video , PPT}\nOnly for Icourse163 : Specify the download content {all, video , PPT , RichText}\nOnly for Datacamp : all the content links will be extracted")
//华文慕课的下载选项,只有三个:全部下载(默认),只下载视频 以及 只下载课件
//中国大学mooc的下载选项:全部下载(默认),只下载视频,只下载课件以及只下载富文本文件

Expand All @@ -36,6 +36,9 @@ func download(url string) bool {
parser.DownloadChinesemooc(url,config.ContentOptions,config.Cookie)
case "icourse163":
parser.DownloadIcourse163(url,config.ContentOptions)

case "datacamp":
parser.DownloadDatacamp(url,config.ContentOptions)
default:
fmt.Println("The website is not supported now ")
return false
Expand All @@ -45,7 +48,7 @@ func download(url string) bool {

func main() {
//此处参考了annie的代码
//fmt.Println(parser.GetStartURLs("http://www.chinesemooc.org/mooc/4880","pku_auth=161evS%2BQJtmq%2FGJRyU%2BFhfaNLyG88SrUPqUX5a0eOUW49JVtBaPxY7lt1vp2MvvcC9UaH8qYx3%2B0cSja0MeVNCmDSWRQ; pku_loginuser=univeroner%40gmail.com; pku_reward_log=daylogin%2C1173273; Hm_lvt_ff4f6e9862a4e0e16fd1f5a7f6f8953b=1569321857,1569494843,1569494850,1569759380; PHPSESSID=p72d5gqftbmp65mmr2n9ghrah5; pku__refer=%252Fmooc%252F4880; Hm_lpvt_ff4f6e9862a4e0e16fd1f5a7f6f8953b=1569761588"))
//fmt.Println(parser.GetDCStartURLs("https://campus.datacamp.com/courses/data-visualization-with-ggplot2-2"))

flag.Parse()
args := flag.Args()
Expand Down
114 changes: 114 additions & 0 deletions parser/datacamp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package parser

import (
"fmt"
"icourse/utils"
)

// DownloadDatacamp is the entry point for DataCamp course URLs. It prints the
// course name taken from the URL and then runs ExtractDCVideo on the same URL.
// The options argument is accepted but not consulted yet: the per-option
// dispatch sketched below is still disabled.
func DownloadDatacamp(url string, options string) {
	fmt.Println(GetDCName(url))
	ExtractDCVideo(url)
	// Planned per-option dispatch (not enabled):
	//switch options{
	//case "all":
	//	DownloadDCAll(url,courseName)
	//case "video":
	//	DownloadDCVideos(url,courseName)
	//case "subtitle":
	//	DownloadDCsubtitles(url,courseName)
	//}
}


// GetDCName returns the course name part of a DataCamp URL: the text captured
// by the pattern `courses/(.*)` in the first match reported by utils.MatchAll.
// It returns "" when the URL contains no such segment, instead of panicking on
// an empty match list.
func GetDCName(url string) string {
	matches := utils.MatchAll(url, `courses/(.*)`)
	if len(matches) == 0 || len(matches[0]) < 2 {
		return ""
	}
	return matches[0][1]
}

// GetDCStartURLs loads the campus.datacamp.com page of the course named in
// starturl and scrapes two lists from it.
//
// The first return value holds the exercise page URLs matched by the
// "VideoExercise" pattern. Only the first half of those matches is kept
// (presumably the page embeds every link twice — TODO confirm against a live
// page).
//
// The second return value holds the slide PDF links matched by the
// "slides_link" pattern, de-duplicated in first-seen order. The previous loop
// bounded the PDF list by the number of video matches, which is unrelated to
// the number of PDF matches and let repeated links through.
func GetDCStartURLs(starturl string) ([]string, []string) {
	// Rebuild the URL on the campus host from the course name in starturl.
	pageURL := "https://campus.datacamp.com/courses/" + GetDCName(starturl)
	data := utils.HttpGet(pageURL)

	content := utils.MatchAll(data, `VideoExercise.*?(https://campus.datacamp.com/courses/.*?)&quot;`)
	ppts := utils.MatchAll(data, `slides_link.*?(https.*?pdf)`)

	var urlList []string
	for _, item := range content[:len(content)/2] {
		urlList = append(urlList, item[1])
	}

	var pdfList []string
	seen := make(map[string]bool, len(ppts))
	for _, item := range ppts {
		link := item[1]
		if seen[link] {
			continue
		}
		seen[link] = true
		pdfList = append(pdfList, link)
	}
	return urlList, pdfList
}

// GetPK fetches the exercise page at url and returns the projector link
// (https://projector.datacamp.com/?projector_key=...) embedded in it, taken
// from the first match of the pattern. It returns "" when the page contains no
// such link, instead of panicking on an empty match list.
func GetPK(url string) string {
	data := utils.HttpGet(url)
	matches := utils.MatchAll(data, `(https://projector.datacamp.com/\?projector_key=.*?)&quot;`)
	if len(matches) == 0 || len(matches[0]) < 2 {
		return ""
	}
	return matches[0][1]
}

// GetDCVideo fetches the projector page at url and returns the video download
// URL and the subtitle (.vtt) download URL found in it, in that order.
//
// The video link is stored scheme-relative ("//videos.datacamp.com/..."), so
// "https:" is prepended. Either result is "" when its pattern does not match,
// instead of panicking on an empty match list.
func GetDCVideo(url string) (string, string) {
	data := utils.HttpGet(url)

	var videoUrl, subtitleUrl string
	if m := utils.MatchAll(data, `video_mp4_link.*?(//videos.datacamp.com/transcoded_mp4/.*?mp4)`); len(m) > 0 && len(m[0]) > 1 {
		videoUrl = "https:" + m[0][1]
	}
	if m := utils.MatchAll(data, `subtitle_vtt_link.*?(https.*?vtt)`); len(m) > 0 && len(m[0]) > 1 {
		subtitleUrl = m[0][1]
	}
	return videoUrl, subtitleUrl
}

////返回需要下载的pdf文件地址
//func ExtractDCPPTs(url string)([]utils.File){
// _,pdfList := GetDCStartURLs(url)
// var pdfs []utils.File
// //for _,item :=range(pdfList){
// // fileName := utils.MatchAll(item,`.*/(.*?pdf)`)[0][1]
// // pdfs = append(pdfs, utils.File{item,fileName})
// //}
// utils.WriteFile("pdflist.txt",pdfList)
// return pdfs
//}

// ExtractDCVideo collects the download information for every video exercise
// of the course at url and returns the video files and the subtitle files to
// download, in that order.
//
// As a side effect it writes three files through utils.WriteFile:
//   - downloadList.txt: video and subtitle URLs, video first for each exercise
//   - rename.bat: one "ren" command per subtitle, renaming it to the video's
//     base name with a .vtt extension
//   - pdflist.txt: the slide PDF links returned by GetDCStartURLs
//
// An exercise whose projector link or video link cannot be extracted is
// reported and skipped, so one bad page no longer aborts the whole run. An
// exercise without a usable subtitle link still contributes its video.
func ExtractDCVideo(url string) ([]utils.File, []utils.File) {
	// Files to download.
	var videos []utils.File
	var subtitles []utils.File
	urlList, pdfList := GetDCStartURLs(url)
	// Lines for the generated list and batch files.
	var downloadList []string
	var renameList []string

	for _, item := range urlList {
		PK := GetPK(item)
		if PK == "" {
			fmt.Println("no projector link found, skipping: " + item)
			continue
		}
		videoUrl, subtitleUrl := GetDCVideo(PK)

		// Base name of the video file, without the extension.
		nameMatch := utils.MatchAll(videoUrl, `.*/(.*?).mp4`)
		if len(nameMatch) == 0 || len(nameMatch[0]) < 2 {
			fmt.Println("no video link found, skipping: " + item)
			continue
		}
		fileName := nameMatch[0][1]
		downloadList = append(downloadList, videoUrl)
		videos = append(videos, utils.File{videoUrl, fileName + ".mp4"})

		// Subtitle file name as served, including the extension.
		subMatch := utils.MatchAll(subtitleUrl, `.*/(.*?.vtt)`)
		if len(subMatch) == 0 || len(subMatch[0]) < 2 {
			continue
		}
		downloadList = append(downloadList, subtitleUrl)
		// Rename command so the subtitle ends up next to its video under the same name.
		renameCM := "ren " + subMatch[0][1] + " " + fileName + ".vtt"
		fmt.Println(renameCM)
		renameList = append(renameList, renameCM)
		subtitles = append(subtitles, utils.File{subtitleUrl, fileName + ".vtt"})
	}

	utils.WriteFile("downloadList.txt", downloadList)
	utils.WriteFile("rename.bat", renameList)
	utils.WriteFile("pdflist.txt", pdfList)
	return videos, subtitles
}
30 changes: 30 additions & 0 deletions utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,28 @@ import (
"strconv"
"strings"
)
// WriteFile creates fileName (truncating it if it already exists) and writes
// every element of content to it, one per line.
//
// Errors are printed to standard output and end the call early; nothing is
// returned to the caller. The file handle is closed on every path once it has
// been opened, including when a write fails part-way through.
func WriteFile(fileName string, content []string) {
	f, err := os.Create(fileName)
	if err != nil {
		// No handle was opened, so there is nothing to close.
		fmt.Println(err)
		return
	}
	for _, v := range content {
		if _, err := fmt.Fprintln(f, v); err != nil {
			fmt.Println(err)
			// Release the handle; the write error has already been reported.
			f.Close()
			return
		}
	}
	if err := f.Close(); err != nil {
		fmt.Println(err)
	}
}


//unicode 转换为utf8
func Unicode2utf8(source string) string {
Expand Down Expand Up @@ -120,6 +142,14 @@ func Domain(url string) string {

//根据url,构造get请求
func HttpGet(s string) string {
//proxy := func(_ *http.Request) (*url.URL, error) {
// return url.Parse("http://127.0.0.1:8888")
//}
//
//transport := &http.Transport{Proxy: proxy}
//
//client := &http.Client{Transport: transport}
//res, err := client.Get(s)
res, err := http.Get(s)
if err != nil {
log.Fatal(err)
Expand Down

0 comments on commit 4442059

Please sign in to comment.